1use core::{char, cmp, fmt::Debug, slice};
2
3use alloc::vec::Vec;
4
5use crate::unicode;
6
7// This module contains an *internal* implementation of interval sets.
8//
// The primary invariant that interval sets guard is canonical ordering. That
10// is, every interval set contains an ordered sequence of intervals where
11// no two intervals are overlapping or adjacent. While this invariant is
12// occasionally broken within the implementation, it should be impossible for
13// callers to observe it.
14//
15// Since case folding (as implemented below) breaks that invariant, we roll
16// that into this API even though it is a little out of place in an otherwise
17// generic interval set. (Hence the reason why the `unicode` module is imported
18// here.)
19//
20// Some of the implementation complexity here is a result of me wanting to
21// preserve the sequential representation without using additional memory.
22// In many cases, we do use linear extra memory, but it is at most 2x and it
23// is amortized. If we relaxed the memory requirements, this implementation
24// could become much simpler. The extra memory is honestly probably OK, but
25// character classes (especially of the Unicode variety) can become quite
26// large, and it would be nice to keep regex compilation snappy even in debug
27// builds. (In the past, I have been careless with this area of code and it has
28// caused slow regex compilations in debug mode, so this isn't entirely
29// unwarranted.)
30//
31// Tests on this are relegated to the public API of HIR in src/hir.rs.
32
/// A set of intervals stored as a sequence of ranges.
///
/// Equality between two sets only considers `ranges`; the `folded` flag is
/// deliberately excluded by the manual `PartialEq` implementation.
#[derive(Clone, Debug)]
pub struct IntervalSet<I> {
    /// A sorted set of non-overlapping ranges.
    ranges: Vec<I>,
    /// While not required at all for correctness, we keep track of whether an
    /// interval set has been case folded or not. This helps us avoid doing
    /// redundant work if, for example, a set has already been case folded.
    /// And note that whether a set is folded or not is preserved through
    /// all of the pairwise set operations. That is, if both interval sets
    /// have been case folded, then any of difference, union, intersection or
    /// symmetric difference all produce a case folded set.
    ///
    /// Note that when this is true, it *must* be the case that the set is case
    /// folded. But when it's false, the set *may* be case folded. In other
    /// words, we only set this to true when we know the set to be case
    /// folded, but we're okay with it being false if it would otherwise be
    /// costly to determine whether it should be true. This means code cannot
    /// assume that a false value necessarily indicates that the set is not
    /// case folded.
    ///
    /// Bottom line: this is a performance optimization.
    folded: bool,
}
55
// Marker impl: the manual `PartialEq` impl compares the `ranges` vectors, and
// every `Interval` is required to be `Eq`, so equality on sets is total.
impl<I: Interval> Eq for IntervalSet<I> {}
57
58// We implement PartialEq manually so that we don't consider the set's internal
59// 'folded' property to be part of its identity. The 'folded' property is
60// strictly an optimization.
61impl<I: Interval> PartialEq for IntervalSet<I> {
62 fn eq(&self, other: &IntervalSet<I>) -> bool {
63 self.ranges.eq(&other.ranges)
64 }
65}
66
67impl<I: Interval> IntervalSet<I> {
68 /// Create a new set from a sequence of intervals. Each interval is
69 /// specified as a pair of bounds, where both bounds are inclusive.
70 ///
71 /// The given ranges do not need to be in any specific order, and ranges
72 /// may overlap.
73 pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
74 let ranges: Vec<I> = intervals.into_iter().collect();
75 // An empty set is case folded.
76 let folded = ranges.is_empty();
77 let mut set = IntervalSet { ranges, folded };
78 set.canonicalize();
79 set
80 }
81
82 /// Add a new interval to this set.
83 pub fn push(&mut self, interval: I) {
84 // TODO: This could be faster. e.g., Push the interval such that
85 // it preserves canonicalization.
86 self.ranges.push(interval);
87 self.canonicalize();
88 // We don't know whether the new interval added here is considered
89 // case folded, so we conservatively assume that the entire set is
90 // no longer case folded if it was previously.
91 self.folded = false;
92 }
93
94 /// Return an iterator over all intervals in this set.
95 ///
96 /// The iterator yields intervals in ascending order.
97 pub fn iter(&self) -> IntervalSetIter<'_, I> {
98 IntervalSetIter(self.ranges.iter())
99 }
100
101 /// Return an immutable slice of intervals in this set.
102 ///
103 /// The sequence returned is in canonical ordering.
104 pub fn intervals(&self) -> &[I] {
105 &self.ranges
106 }
107
108 /// Expand this interval set such that it contains all case folded
109 /// characters. For example, if this class consists of the range `a-z`,
110 /// then applying case folding will result in the class containing both the
111 /// ranges `a-z` and `A-Z`.
112 ///
113 /// This returns an error if the necessary case mapping data is not
114 /// available.
115 pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
116 if self.folded {
117 return Ok(());
118 }
119 let len = self.ranges.len();
120 for i in 0..len {
121 let range = self.ranges[i];
122 if let Err(err) = range.case_fold_simple(&mut self.ranges) {
123 self.canonicalize();
124 return Err(err);
125 }
126 }
127 self.canonicalize();
128 self.folded = true;
129 Ok(())
130 }
131
132 /// Union this set with the given set, in place.
133 pub fn union(&mut self, other: &IntervalSet<I>) {
134 if other.ranges.is_empty() || self.ranges == other.ranges {
135 return;
136 }
137 // This could almost certainly be done more efficiently.
138 self.ranges.extend(&other.ranges);
139 self.canonicalize();
140 self.folded = self.folded && other.folded;
141 }
142
143 /// Intersect this set with the given set, in place.
144 pub fn intersect(&mut self, other: &IntervalSet<I>) {
145 if self.ranges.is_empty() {
146 return;
147 }
148 if other.ranges.is_empty() {
149 self.ranges.clear();
150 // An empty set is case folded.
151 self.folded = true;
152 return;
153 }
154
155 // There should be a way to do this in-place with constant memory,
156 // but I couldn't figure out a simple way to do it. So just append
157 // the intersection to the end of this range, and then drain it before
158 // we're done.
159 let drain_end = self.ranges.len();
160
161 let mut ita = 0..drain_end;
162 let mut itb = 0..other.ranges.len();
163 let mut a = ita.next().unwrap();
164 let mut b = itb.next().unwrap();
165 loop {
166 if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) {
167 self.ranges.push(ab);
168 }
169 let (it, aorb) =
170 if self.ranges[a].upper() < other.ranges[b].upper() {
171 (&mut ita, &mut a)
172 } else {
173 (&mut itb, &mut b)
174 };
175 match it.next() {
176 Some(v) => *aorb = v,
177 None => break,
178 }
179 }
180 self.ranges.drain(..drain_end);
181 self.folded = self.folded && other.folded;
182 }
183
    /// Subtract the given set from this set, in place.
    ///
    /// After this returns, the set contains exactly the elements that were
    /// in `self` but not in `other`. If either set is empty, this is a no-op.
    pub fn difference(&mut self, other: &IntervalSet<I>) {
        if self.ranges.is_empty() || other.ranges.is_empty() {
            return;
        }

        // This algorithm is (to me) surprisingly complex. A search of the
        // interwebs indicates that this is a potentially interesting problem.
        // Folks seem to suggest interval or segment trees, but I'd like to
        // avoid the overhead (both runtime and conceptual) of that.
        //
        // The following is basically a rough first draft. Therefore, in
        // order to grok it, you probably need to read each line carefully.
        // Simplifications are most welcome!
        //
        // Remember, we can assume the canonical format invariant here, which
        // says that all ranges are sorted, not overlapping and not adjacent in
        // each class.
        //
        // The result is appended after the first `drain_end` ranges, which
        // are then drained away at the end. `a` indexes the original ranges
        // of `self` and `b` indexes the ranges of `other`.
        let drain_end = self.ranges.len();
        let (mut a, mut b) = (0, 0);
        'LOOP: while a < drain_end && b < other.ranges.len() {
            // Basically, the easy cases are when neither range overlaps with
            // each other. If the `b` range is less than our current `a`
            // range, then we can skip it and move on.
            if other.ranges[b].upper() < self.ranges[a].lower() {
                b += 1;
                continue;
            }
            // ... similarly for the `a` range. If it's less than the smallest
            // `b` range, then we can add it as-is.
            if self.ranges[a].upper() < other.ranges[b].lower() {
                let range = self.ranges[a];
                self.ranges.push(range);
                a += 1;
                continue;
            }
            // Otherwise, we have overlapping ranges.
            assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b]));

            // This part is tricky and was non-obvious to me without looking
            // at explicit examples (see the tests). The trickiness stems from
            // two things: 1) subtracting a range from another range could
            // yield two ranges and 2) after subtracting a range, it's possible
            // that future ranges can have an impact. The loop below advances
            // the `b` ranges until they can't possibly impact the current
            // range.
            //
            // For example, if our `a` range is `a-t` and our next four `b`
            // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
            // subtraction three times before moving on to the next `a` range.
            let mut range = self.ranges[a];
            while b < other.ranges.len()
                && !range.is_intersection_empty(&other.ranges[b])
            {
                let old_range = range;
                range = match range.difference(&other.ranges[b]) {
                    (None, None) => {
                        // We lost the entire range, so move on to the next
                        // without adding this one.
                        a += 1;
                        continue 'LOOP;
                    }
                    (Some(range1), None) | (None, Some(range1)) => range1,
                    (Some(range1), Some(range2)) => {
                        // The `b` range split ours in two. The lower piece
                        // is final; keep working on the upper piece.
                        self.ranges.push(range1);
                        range2
                    }
                };
                // It's possible that the `b` range has more to contribute
                // here. In particular, if it is greater than the original
                // range, then it might impact the next `a` range *and* it
                // has impacted the current `a` range as much as possible,
                // so we can quit. We don't bump `b` so that the next `a`
                // range can apply it.
                if other.ranges[b].upper() > old_range.upper() {
                    break;
                }
                // Otherwise, the next `b` range might apply to the current
                // `a` range.
                b += 1;
            }
            self.ranges.push(range);
            a += 1;
        }
        // `other` is exhausted, so whatever is left of `self` survives as-is.
        while a < drain_end {
            let range = self.ranges[a];
            self.ranges.push(range);
            a += 1;
        }
        self.ranges.drain(..drain_end);
        self.folded = self.folded && other.folded;
    }
276
277 /// Compute the symmetric difference of the two sets, in place.
278 ///
279 /// This computes the symmetric difference of two interval sets. This
280 /// removes all elements in this set that are also in the given set,
281 /// but also adds all elements from the given set that aren't in this
282 /// set. That is, the set will contain all elements in either set,
283 /// but will not contain any elements that are in both sets.
284 pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) {
285 // TODO(burntsushi): Fix this so that it amortizes allocation.
286 let mut intersection = self.clone();
287 intersection.intersect(other);
288 self.union(other);
289 self.difference(&intersection);
290 }
291
292 /// Negate this interval set.
293 ///
294 /// For all `x` where `x` is any element, if `x` was in this set, then it
295 /// will not be in this set after negation.
296 pub fn negate(&mut self) {
297 if self.ranges.is_empty() {
298 let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
299 self.ranges.push(I::create(min, max));
300 // The set containing everything must case folded.
301 self.folded = true;
302 return;
303 }
304
305 // There should be a way to do this in-place with constant memory,
306 // but I couldn't figure out a simple way to do it. So just append
307 // the negation to the end of this range, and then drain it before
308 // we're done.
309 let drain_end = self.ranges.len();
310
311 // We do checked arithmetic below because of the canonical ordering
312 // invariant.
313 if self.ranges[0].lower() > I::Bound::min_value() {
314 let upper = self.ranges[0].lower().decrement();
315 self.ranges.push(I::create(I::Bound::min_value(), upper));
316 }
317 for i in 1..drain_end {
318 let lower = self.ranges[i - 1].upper().increment();
319 let upper = self.ranges[i].lower().decrement();
320 self.ranges.push(I::create(lower, upper));
321 }
322 if self.ranges[drain_end - 1].upper() < I::Bound::max_value() {
323 let lower = self.ranges[drain_end - 1].upper().increment();
324 self.ranges.push(I::create(lower, I::Bound::max_value()));
325 }
326 self.ranges.drain(..drain_end);
327 // We don't need to update whether this set is folded or not, because
328 // it is conservatively preserved through negation. Namely, if a set
329 // is not folded, then it is possible that its negation is folded, for
330 // example, [^☃]. But we're fine with assuming that the set is not
331 // folded in that case. (`folded` permits false negatives but not false
332 // positives.)
333 //
334 // But what about when a set is folded, is its negation also
335 // necessarily folded? Yes. Because if a set is folded, then for every
336 // character in the set, it necessarily included its equivalence class
337 // of case folded characters. Negating it in turn means that all
338 // equivalence classes in the set are negated, and any equivalence
339 // class that was previously not in the set is now entirely in the set.
340 }
341
342 /// Converts this set into a canonical ordering.
343 fn canonicalize(&mut self) {
344 if self.is_canonical() {
345 return;
346 }
347 self.ranges.sort();
348 assert!(!self.ranges.is_empty());
349
350 // Is there a way to do this in-place with constant memory? I couldn't
351 // figure out a way to do it. So just append the canonicalization to
352 // the end of this range, and then drain it before we're done.
353 let drain_end = self.ranges.len();
354 for oldi in 0..drain_end {
355 // If we've added at least one new range, then check if we can
356 // merge this range in the previously added range.
357 if self.ranges.len() > drain_end {
358 let (last, rest) = self.ranges.split_last_mut().unwrap();
359 if let Some(union) = last.union(&rest[oldi]) {
360 *last = union;
361 continue;
362 }
363 }
364 let range = self.ranges[oldi];
365 self.ranges.push(range);
366 }
367 self.ranges.drain(..drain_end);
368 }
369
370 /// Returns true if and only if this class is in a canonical ordering.
371 fn is_canonical(&self) -> bool {
372 for pair in self.ranges.windows(2) {
373 if pair[0] >= pair[1] {
374 return false;
375 }
376 if pair[0].is_contiguous(&pair[1]) {
377 return false;
378 }
379 }
380 true
381 }
382}
383
/// An iterator over the intervals of an `IntervalSet`.
///
/// This is a thin wrapper around a slice iterator and yields references to
/// the stored intervals. It is created by `IntervalSet::iter`.
#[derive(Debug)]
pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>);
387
388impl<'a, I> Iterator for IntervalSetIter<'a, I> {
389 type Item = &'a I;
390
391 fn next(&mut self) -> Option<&'a I> {
392 self.0.next()
393 }
394}
395
396pub trait Interval:
397 Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord
398{
399 type Bound: Bound;
400
401 fn lower(&self) -> Self::Bound;
402 fn upper(&self) -> Self::Bound;
403 fn set_lower(&mut self, bound: Self::Bound);
404 fn set_upper(&mut self, bound: Self::Bound);
405 fn case_fold_simple(
406 &self,
407 intervals: &mut Vec<Self>,
408 ) -> Result<(), unicode::CaseFoldError>;
409
410 /// Create a new interval.
411 fn create(lower: Self::Bound, upper: Self::Bound) -> Self {
412 let mut int = Self::default();
413 if lower <= upper {
414 int.set_lower(lower);
415 int.set_upper(upper);
416 } else {
417 int.set_lower(upper);
418 int.set_upper(lower);
419 }
420 int
421 }
422
423 /// Union the given overlapping range into this range.
424 ///
425 /// If the two ranges aren't contiguous, then this returns `None`.
426 fn union(&self, other: &Self) -> Option<Self> {
427 if !self.is_contiguous(other) {
428 return None;
429 }
430 let lower = cmp::min(self.lower(), other.lower());
431 let upper = cmp::max(self.upper(), other.upper());
432 Some(Self::create(lower, upper))
433 }
434
435 /// Intersect this range with the given range and return the result.
436 ///
437 /// If the intersection is empty, then this returns `None`.
438 fn intersect(&self, other: &Self) -> Option<Self> {
439 let lower = cmp::max(self.lower(), other.lower());
440 let upper = cmp::min(self.upper(), other.upper());
441 if lower <= upper {
442 Some(Self::create(lower, upper))
443 } else {
444 None
445 }
446 }
447
448 /// Subtract the given range from this range and return the resulting
449 /// ranges.
450 ///
451 /// If subtraction would result in an empty range, then no ranges are
452 /// returned.
453 fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) {
454 if self.is_subset(other) {
455 return (None, None);
456 }
457 if self.is_intersection_empty(other) {
458 return (Some(self.clone()), None);
459 }
460 let add_lower = other.lower() > self.lower();
461 let add_upper = other.upper() < self.upper();
462 // We know this because !self.is_subset(other) and the ranges have
463 // a non-empty intersection.
464 assert!(add_lower || add_upper);
465 let mut ret = (None, None);
466 if add_lower {
467 let upper = other.lower().decrement();
468 ret.0 = Some(Self::create(self.lower(), upper));
469 }
470 if add_upper {
471 let lower = other.upper().increment();
472 let range = Self::create(lower, self.upper());
473 if ret.0.is_none() {
474 ret.0 = Some(range);
475 } else {
476 ret.1 = Some(range);
477 }
478 }
479 ret
480 }
481
482 /// Compute the symmetric difference the given range from this range. This
483 /// returns the union of the two ranges minus its intersection.
484 fn symmetric_difference(
485 &self,
486 other: &Self,
487 ) -> (Option<Self>, Option<Self>) {
488 let union = match self.union(other) {
489 None => return (Some(self.clone()), Some(other.clone())),
490 Some(union) => union,
491 };
492 let intersection = match self.intersect(other) {
493 None => return (Some(self.clone()), Some(other.clone())),
494 Some(intersection) => intersection,
495 };
496 union.difference(&intersection)
497 }
498
499 /// Returns true if and only if the two ranges are contiguous. Two ranges
500 /// are contiguous if and only if the ranges are either overlapping or
501 /// adjacent.
502 fn is_contiguous(&self, other: &Self) -> bool {
503 let lower1 = self.lower().as_u32();
504 let upper1 = self.upper().as_u32();
505 let lower2 = other.lower().as_u32();
506 let upper2 = other.upper().as_u32();
507 cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1)
508 }
509
510 /// Returns true if and only if the intersection of this range and the
511 /// other range is empty.
512 fn is_intersection_empty(&self, other: &Self) -> bool {
513 let (lower1, upper1) = (self.lower(), self.upper());
514 let (lower2, upper2) = (other.lower(), other.upper());
515 cmp::max(lower1, lower2) > cmp::min(upper1, upper2)
516 }
517
518 /// Returns true if and only if this range is a subset of the other range.
519 fn is_subset(&self, other: &Self) -> bool {
520 let (lower1, upper1) = (self.lower(), self.upper());
521 let (lower2, upper2) = (other.lower(), other.upper());
522 (lower2 <= lower1 && lower1 <= upper2)
523 && (lower2 <= upper1 && upper1 <= upper2)
524 }
525}
526
/// A value that can be used as an endpoint of an [`Interval`].
///
/// This module implements it for `u8` and `char`.
pub trait Bound:
    Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord
{
    /// Returns the smallest value of this type.
    fn min_value() -> Self;
    /// Returns the largest value of this type.
    fn max_value() -> Self;
    /// Converts this bound to a `u32`.
    fn as_u32(self) -> u32;
    /// Returns the value immediately after `self`.
    ///
    /// The implementations in this module panic when `self` has no
    /// successor, so callers must rule that case out first.
    fn increment(self) -> Self;
    /// Returns the value immediately before `self`.
    ///
    /// The implementations in this module panic when `self` has no
    /// predecessor, so callers must rule that case out first.
    fn decrement(self) -> Self;
}
536
537impl Bound for u8 {
538 fn min_value() -> Self {
539 u8::MIN
540 }
541 fn max_value() -> Self {
542 u8::MAX
543 }
544 fn as_u32(self) -> u32 {
545 u32::from(self)
546 }
547 fn increment(self) -> Self {
548 self.checked_add(1).unwrap()
549 }
550 fn decrement(self) -> Self {
551 self.checked_sub(1).unwrap()
552 }
553}
554
555impl Bound for char {
556 fn min_value() -> Self {
557 '\x00'
558 }
559 fn max_value() -> Self {
560 '\u{10FFFF}'
561 }
562 fn as_u32(self) -> u32 {
563 u32::from(self)
564 }
565
566 fn increment(self) -> Self {
567 match self {
568 '\u{D7FF}' => '\u{E000}',
569 c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(),
570 }
571 }
572
573 fn decrement(self) -> Self {
574 match self {
575 '\u{E000}' => '\u{D7FF}',
576 c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(),
577 }
578 }
579}
580
581// Tests for interval sets are written in src/hir.rs against the public API.
582