interval.rs source code [crates/regex-syntax-0.6.29/src/hir/interval.rs]

1	use std::char;
2	use std::cmp;
3	use std::fmt::Debug;
4	use std::slice;
5	use std::u8;
6
7	use crate::unicode;
8
// This module contains an *internal* implementation of interval sets.
//
// The primary invariant that interval sets guard is canonical ordering. That
// is, every interval set contains an ordered sequence of intervals where
// no two intervals are overlapping or adjacent. While this invariant is
// occasionally broken within the implementation, it should be impossible for
// callers to observe it.
//
// Since case folding (as implemented below) breaks that invariant, we roll
// that into this API even though it is a little out of place in an otherwise
// generic interval set. (Hence the reason why the `unicode` module is imported
// here.)
//
// Some of the implementation complexity here is a result of me wanting to
// preserve the sequential representation without using additional memory.
// In many cases, we do use linear extra memory, but it is at most 2x and it
// is amortized. If we relaxed the memory requirements, this implementation
// could become much simpler. The extra memory is honestly probably OK, but
// character classes (especially of the Unicode variety) can become quite
// large, and it would be nice to keep regex compilation snappy even in debug
// builds. (In the past, I have been careless with this area of code and it has
// caused slow regex compilations in debug mode, so this isn't entirely
// unwarranted.)
//
// Tests on this are relegated to the public API of HIR in src/hir.rs.
34
/// A set of intervals, stored as a flat vector.
///
/// The methods on this type keep `ranges` in canonical form: sorted in
/// ascending order, with no two intervals overlapping or adjacent.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct IntervalSet<I> {
    /// The intervals making up this set.
    ranges: Vec<I>,
}
39
40	impl<I: Interval> IntervalSet<I> {
41	/// Create a new set from a sequence of intervals. Each interval is
42	/// specified as a pair of bounds, where both bounds are inclusive.
43	///
44	/// The given ranges do not need to be in any specific order, and ranges
45	/// may overlap.
46	pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
47	let mut set = IntervalSet { ranges: intervals.into_iter().collect() };
48	set.canonicalize();
49	set
50	}
51
52	/// Add a new interval to this set.
53	pub fn push(&mut self, interval: I) {
54	// TODO: This could be faster. e.g., Push the interval such that
55	// it preserves canonicalization.
56	self.ranges.push(interval);
57	self.canonicalize();
58	}
59
60	/// Return an iterator over all intervals in this set.
61	///
62	/// The iterator yields intervals in ascending order.
63	pub fn iter(&self) -> IntervalSetIter<'_, I> {
64	IntervalSetIter(self.ranges.iter())
65	}
66
67	/// Return an immutable slice of intervals in this set.
68	///
69	/// The sequence returned is in canonical ordering.
70	pub fn intervals(&self) -> &[I] {
71	&self.ranges
72	}
73
74	/// Expand this interval set such that it contains all case folded
75	/// characters. For example, if this class consists of the range `a-z`,
76	/// then applying case folding will result in the class containing both the
77	/// ranges `a-z` and `A-Z`.
78	///
79	/// This returns an error if the necessary case mapping data is not
80	/// available.
81	pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
82	let len = self.ranges.len();
83	for i in `0`..len {
84	let range = self.ranges[i];
85	if let Err(err) = range.case_fold_simple(&mut self.ranges) {
86	self.canonicalize();
87	return Err(err);
88	}
89	}
90	self.canonicalize();
91	Ok(())
92	}
93
94	/// Union this set with the given set, in place.
95	pub fn union(&mut self, other: &IntervalSet<I>) {
96	// This could almost certainly be done more efficiently.
97	self.ranges.extend(&other.ranges);
98	self.canonicalize();
99	}
100
101	/// Intersect this set with the given set, in place.
102	pub fn intersect(&mut self, other: &IntervalSet<I>) {
103	if self.ranges.is_empty() {
104	return;
105	}
106	if other.ranges.is_empty() {
107	self.ranges.clear();
108	return;
109	}
110
111	// There should be a way to do this in-place with constant memory,
112	// but I couldn't figure out a simple way to do it. So just append
113	// the intersection to the end of this range, and then drain it before
114	// we're done.
115	let drain_end = self.ranges.len();
116
117	let mut ita = `0`..drain_end;
118	let mut itb = `0`..other.ranges.len();
119	let mut a = ita.next().unwrap();
120	let mut b = itb.next().unwrap();
121	loop {
122	if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) {
123	self.ranges.push(ab);
124	}
125	let (it, aorb) =
126	if self.ranges[a].upper() < other.ranges[b].upper() {
127	(&mut ita, &mut a)
128	} else {
129	(&mut itb, &mut b)
130	};
131	match it.next() {
132	Some(v) => *aorb = v,
133	None => break,
134	}
135	}
136	self.ranges.drain(..drain_end);
137	}
138
139	/// Subtract the given set from this set, in place.
140	pub fn difference(&mut self, other: &IntervalSet<I>) {
141	if self.ranges.is_empty() \|\| other.ranges.is_empty() {
142	return;
143	}
144
145	// This algorithm is (to me) surprisingly complex. A search of the
146	// interwebs indicate that this is a potentially interesting problem.
147	// Folks seem to suggest interval or segment trees, but I'd like to
148	// avoid the overhead (both runtime and conceptual) of that.
149	//
150	// The following is basically my Shitty First Draft. Therefore, in
151	// order to grok it, you probably need to read each line carefully.
152	// Simplifications are most welcome!
153	//
154	// Remember, we can assume the canonical format invariant here, which
155	// says that all ranges are sorted, not overlapping and not adjacent in
156	// each class.
157	let drain_end = self.ranges.len();
158	let (mut a, mut b) = (`0`, `0`);
159	'LOOP: while a < drain_end && b < other.ranges.len() {
160	// Basically, the easy cases are when neither range overlaps with
161	// each other. If the `b` range is less than our current `a`
162	// range, then we can skip it and move on.
163	if other.ranges[b].upper() < self.ranges[a].lower() {
164	b += `1`;
165	continue;
166	}
167	// ... similarly for the `a` range. If it's less than the smallest
168	// `b` range, then we can add it as-is.
169	if self.ranges[a].upper() < other.ranges[b].lower() {
170	let range = self.ranges[a];
171	self.ranges.push(range);
172	a += `1`;
173	continue;
174	}
175	// Otherwise, we have overlapping ranges.
176	assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b]));
177
178	// This part is tricky and was non-obvious to me without looking
179	// at explicit examples (see the tests). The trickiness stems from
180	// two things: 1) subtracting a range from another range could
181	// yield two ranges and 2) after subtracting a range, it's possible
182	// that future ranges can have an impact. The loop below advances
183	// the `b` ranges until they can't possible impact the current
184	// range.
185	//
186	// For example, if our `a` range is `a-t` and our next three `b`
187	// ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
188	// subtraction three times before moving on to the next `a` range.
189	let mut range = self.ranges[a];
190	while b < other.ranges.len()
191	&& !range.is_intersection_empty(&other.ranges[b])
192	{
193	let old_range = range;
194	range = match range.difference(&other.ranges[b]) {
195	(None, None) => {
196	// We lost the entire range, so move on to the next
197	// without adding this one.
198	a += `1`;
199	continue 'LOOP;
200	}
201	(Some(range1), None) \| (None, Some(range1)) => range1,
202	(Some(range1), Some(range2)) => {
203	self.ranges.push(range1);
204	range2
205	}
206	};
207	// It's possible that the `b` range has more to contribute
208	// here. In particular, if it is greater than the original
209	// range, then it might impact the next `a` range and* it*
210	// has impacted the current `a` range as much as possible,
211	// so we can quit. We don't bump `b` so that the next `a`
212	// range can apply it.
213	if other.ranges[b].upper() > old_range.upper() {
214	break;
215	}
216	// Otherwise, the next `b` range might apply to the current
217	// `a` range.
218	b += `1`;
219	}
220	self.ranges.push(range);
221	a += `1`;
222	}
223	while a < drain_end {
224	let range = self.ranges[a];
225	self.ranges.push(range);
226	a += `1`;
227	}
228	self.ranges.drain(..drain_end);
229	}
230
231	/// Compute the symmetric difference of the two sets, in place.
232	///
233	/// This computes the symmetric difference of two interval sets. This
234	/// removes all elements in this set that are also in the given set,
235	/// but also adds all elements from the given set that aren't in this
236	/// set. That is, the set will contain all elements in either set,
237	/// but will not contain any elements that are in both sets.
238	pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) {
239	// TODO(burntsushi): Fix this so that it amortizes allocation.
240	let mut intersection = self.clone();
241	intersection.intersect(other);
242	self.union(other);
243	self.difference(&intersection);
244	}
245
246	/// Negate this interval set.
247	///
248	/// For all `x` where `x` is any element, if `x` was in this set, then it
249	/// will not be in this set after negation.
250	pub fn negate(&mut self) {
251	if self.ranges.is_empty() {
252	let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
253	self.ranges.push(I::create(min, max));
254	return;
255	}
256
257	// There should be a way to do this in-place with constant memory,
258	// but I couldn't figure out a simple way to do it. So just append
259	// the negation to the end of this range, and then drain it before
260	// we're done.
261	let drain_end = self.ranges.len();
262
263	// We do checked arithmetic below because of the canonical ordering
264	// invariant.
265	if self.ranges[`0`].lower() > I::Bound::min_value() {
266	let upper = self.ranges[`0`].lower().decrement();
267	self.ranges.push(I::create(I::Bound::min_value(), upper));
268	}
269	for i in `1`..drain_end {
270	let lower = self.ranges[i - `1`].upper().increment();
271	let upper = self.ranges[i].lower().decrement();
272	self.ranges.push(I::create(lower, upper));
273	}
274	if self.ranges[drain_end - `1`].upper() < I::Bound::max_value() {
275	let lower = self.ranges[drain_end - `1`].upper().increment();
276	self.ranges.push(I::create(lower, I::Bound::max_value()));
277	}
278	self.ranges.drain(..drain_end);
279	}
280
281	/// Converts this set into a canonical ordering.
282	fn canonicalize(&mut self) {
283	if self.is_canonical() {
284	return;
285	}
286	self.ranges.sort();
287	assert!(!self.ranges.is_empty());
288
289	// Is there a way to do this in-place with constant memory? I couldn't
290	// figure out a way to do it. So just append the canonicalization to
291	// the end of this range, and then drain it before we're done.
292	let drain_end = self.ranges.len();
293	for oldi in `0`..drain_end {
294	// If we've added at least one new range, then check if we can
295	// merge this range in the previously added range.
296	if self.ranges.len() > drain_end {
297	let (last, rest) = self.ranges.split_last_mut().unwrap();
298	if let Some(union) = last.union(&rest[oldi]) {
299	*last = union;
300	continue;
301	}
302	}
303	let range = self.ranges[oldi];
304	self.ranges.push(range);
305	}
306	self.ranges.drain(..drain_end);
307	}
308
309	/// Returns true if and only if this class is in a canonical ordering.
310	fn is_canonical(&self) -> bool {
311	for pair in self.ranges.windows(`2`) {
312	if pair[`0`] >= pair[`1`] {
313	return `false`;
314	}
315	if pair[`0`].is_contiguous(&pair[`1`]) {
316	return `false`;
317	}
318	}
319	`true`
320	}
321	}
322
/// An iterator over intervals.
///
/// This is a thin wrapper around a slice iterator and yields references to
/// the intervals in the order they are stored.
#[derive(Debug)]
pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>);

impl<'a, I> Iterator for IntervalSetIter<'a, I> {
    type Item = &'a I;

    /// Yield the next interval, or `None` once the slice is exhausted.
    fn next(&mut self) -> Option<&'a I> {
        self.0.next()
    }

    /// Forward the exact remaining length of the underlying slice iterator
    /// so that consumers such as `collect` can size their allocations.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}
334
335	pub trait Interval:
336	Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord
337	{
338	type Bound: Bound;
339
340	fn lower(&self) -> Self::Bound;
341	fn upper(&self) -> Self::Bound;
342	fn set_lower(&mut self, bound: Self::Bound);
343	fn set_upper(&mut self, bound: Self::Bound);
344	fn case_fold_simple(
345	&self,
346	intervals: &mut Vec<Self>,
347	) -> Result<(), unicode::CaseFoldError>;
348
349	/// Create a new interval.
350	fn create(lower: Self::Bound, upper: Self::Bound) -> Self {
351	let mut int = Self::default();
352	if lower <= upper {
353	int.set_lower(lower);
354	int.set_upper(upper);
355	} else {
356	int.set_lower(upper);
357	int.set_upper(lower);
358	}
359	int
360	}
361
362	/// Union the given overlapping range into this range.
363	///
364	/// If the two ranges aren't contiguous, then this returns `None`.
365	fn union(&self, other: &Self) -> Option<Self> {
366	if !self.is_contiguous(other) {
367	return None;
368	}
369	let lower = cmp::min(self.lower(), other.lower());
370	let upper = cmp::max(self.upper(), other.upper());
371	Some(Self::create(lower, upper))
372	}
373
374	/// Intersect this range with the given range and return the result.
375	///
376	/// If the intersection is empty, then this returns `None`.
377	fn intersect(&self, other: &Self) -> Option<Self> {
378	let lower = cmp::max(self.lower(), other.lower());
379	let upper = cmp::min(self.upper(), other.upper());
380	if lower <= upper {
381	Some(Self::create(lower, upper))
382	} else {
383	None
384	}
385	}
386
387	/// Subtract the given range from this range and return the resulting
388	/// ranges.
389	///
390	/// If subtraction would result in an empty range, then no ranges are
391	/// returned.
392	fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) {
393	if self.is_subset(other) {
394	return (None, None);
395	}
396	if self.is_intersection_empty(other) {
397	return (Some(self.clone()), None);
398	}
399	let add_lower = other.lower() > self.lower();
400	let add_upper = other.upper() < self.upper();
401	// We know this because !self.is_subset(other) and the ranges have
402	// a non-empty intersection.
403	assert!(add_lower \|\| add_upper);
404	let mut ret = (None, None);
405	if add_lower {
406	let upper = other.lower().decrement();
407	ret.0 = Some(Self::create(self.lower(), upper));
408	}
409	if add_upper {
410	let lower = other.upper().increment();
411	let range = Self::create(lower, self.upper());
412	if ret.0.is_none() {
413	ret.0 = Some(range);
414	} else {
415	ret.1 = Some(range);
416	}
417	}
418	ret
419	}
420
421	/// Compute the symmetric difference the given range from this range. This
422	/// returns the union of the two ranges minus its intersection.
423	fn symmetric_difference(
424	&self,
425	other: &Self,
426	) -> (Option<Self>, Option<Self>) {
427	let union = match self.union(other) {
428	None => return (Some(self.clone()), Some(other.clone())),
429	Some(union) => union,
430	};
431	let intersection = match self.intersect(other) {
432	None => return (Some(self.clone()), Some(other.clone())),
433	Some(intersection) => intersection,
434	};
435	union.difference(&intersection)
436	}
437
438	/// Returns true if and only if the two ranges are contiguous. Two ranges
439	/// are contiguous if and only if the ranges are either overlapping or
440	/// adjacent.
441	fn is_contiguous(&self, other: &Self) -> bool {
442	let lower1 = self.lower().as_u32();
443	let upper1 = self.upper().as_u32();
444	let lower2 = other.lower().as_u32();
445	let upper2 = other.upper().as_u32();
446	cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(`1`)
447	}
448
449	/// Returns true if and only if the intersection of this range and the
450	/// other range is empty.
451	fn is_intersection_empty(&self, other: &Self) -> bool {
452	let (lower1, upper1) = (self.lower(), self.upper());
453	let (lower2, upper2) = (other.lower(), other.upper());
454	cmp::max(lower1, lower2) > cmp::min(upper1, upper2)
455	}
456
457	/// Returns true if and only if this range is a subset of the other range.
458	fn is_subset(&self, other: &Self) -> bool {
459	let (lower1, upper1) = (self.lower(), self.upper());
460	let (lower2, upper2) = (other.lower(), other.upper());
461	(lower2 <= lower1 && lower1 <= upper2)
462	&& (lower2 <= upper1 && upper1 <= upper2)
463	}
464	}
465
/// A value that can serve as an endpoint of an interval.
pub trait Bound:
    Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord
{
    /// The smallest possible bound.
    fn min_value() -> Self;
    /// The largest possible bound.
    fn max_value() -> Self;
    /// This bound converted to a `u32`.
    fn as_u32(self) -> u32;
    /// The bound immediately after this one.
    fn increment(self) -> Self;
    /// The bound immediately before this one.
    fn decrement(self) -> Self;
}

impl Bound for u8 {
    fn min_value() -> Self {
        u8::MIN
    }
    fn max_value() -> Self {
        u8::MAX
    }
    fn as_u32(self) -> u32 {
        u32::from(self)
    }
    /// Panics if `self` is already `u8::MAX`.
    fn increment(self) -> Self {
        self.checked_add(1).unwrap()
    }
    /// Panics if `self` is already `u8::MIN`.
    fn decrement(self) -> Self {
        self.checked_sub(1).unwrap()
    }
}

impl Bound for char {
    fn min_value() -> Self {
        '\x00'
    }
    fn max_value() -> Self {
        '\u{10FFFF}'
    }
    fn as_u32(self) -> u32 {
        u32::from(self)
    }

    /// Panics if `self` is already `'\u{10FFFF}'`.
    fn increment(self) -> Self {
        match self {
            // Step over the surrogate range, which holds no valid `char`.
            '\u{D7FF}' => '\u{E000}',
            c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(),
        }
    }

    /// Panics if `self` is already `'\x00'`.
    fn decrement(self) -> Self {
        match self {
            // Step over the surrogate range, which holds no valid `char`.
            '\u{E000}' => '\u{D7FF}',
            c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(),
        }
    }
}
519
520	// Tests for interval sets are written in src/hir.rs against the public API.
521