1 | use core::{char, cmp, fmt::Debug, slice}; |
2 | |
3 | use alloc::vec::Vec; |
4 | |
5 | use crate::unicode; |
6 | |
7 | // This module contains an *internal* implementation of interval sets. |
8 | // |
// The primary invariant that interval sets guard is canonical ordering. That
// is, every interval set contains an ordered sequence of intervals where
// no two intervals are overlapping or adjacent. While this invariant is
// occasionally broken within the implementation, it should be impossible for
// callers to observe a violation.
14 | // |
15 | // Since case folding (as implemented below) breaks that invariant, we roll |
16 | // that into this API even though it is a little out of place in an otherwise |
17 | // generic interval set. (Hence the reason why the `unicode` module is imported |
18 | // here.) |
19 | // |
20 | // Some of the implementation complexity here is a result of me wanting to |
21 | // preserve the sequential representation without using additional memory. |
22 | // In many cases, we do use linear extra memory, but it is at most 2x and it |
23 | // is amortized. If we relaxed the memory requirements, this implementation |
24 | // could become much simpler. The extra memory is honestly probably OK, but |
25 | // character classes (especially of the Unicode variety) can become quite |
26 | // large, and it would be nice to keep regex compilation snappy even in debug |
27 | // builds. (In the past, I have been careless with this area of code and it has |
28 | // caused slow regex compilations in debug mode, so this isn't entirely |
29 | // unwarranted.) |
30 | // |
31 | // Tests on this are relegated to the public API of HIR in src/hir.rs. |
32 | |
/// A set of intervals kept in canonical form: sorted, with no two intervals
/// overlapping or adjacent.
#[derive (Clone, Debug)]
pub struct IntervalSet<I> {
    /// A sorted set of non-overlapping ranges.
    ranges: Vec<I>,
    /// While not required at all for correctness, we keep track of whether an
    /// interval set has been case folded or not. This helps us avoid doing
    /// redundant work if, for example, a set has already been case folded.
    /// And note that whether a set is folded or not is preserved through
    /// all of the pairwise set operations. That is, if both interval sets
    /// have been case folded, then any of difference, union, intersection or
    /// symmetric difference all produce a case folded set.
    ///
    /// Note that when this is true, it *must* be the case that the set is case
    /// folded. But when it's false, the set *may* be case folded. In other
    /// words, we only set this to true when we know it to be the case, but
    /// we're okay with it being false if it would otherwise be costly to
    /// determine whether it should be true. This means code cannot assume
    /// that a false value necessarily indicates that the set is not case
    /// folded.
    ///
    /// Bottom line: this is a performance optimization.
    folded: bool,
}
55 | |
// Equality is decided solely by the manual `PartialEq` impl, which compares
// the `ranges` vectors; `Interval` requires `Eq`, so the relation is total.
impl<I: Interval> Eq for IntervalSet<I> {}
57 | |
58 | // We implement PartialEq manually so that we don't consider the set's internal |
59 | // 'folded' property to be part of its identity. The 'folded' property is |
60 | // strictly an optimization. |
61 | impl<I: Interval> PartialEq for IntervalSet<I> { |
62 | fn eq(&self, other: &IntervalSet<I>) -> bool { |
63 | self.ranges.eq(&other.ranges) |
64 | } |
65 | } |
66 | |
67 | impl<I: Interval> IntervalSet<I> { |
68 | /// Create a new set from a sequence of intervals. Each interval is |
69 | /// specified as a pair of bounds, where both bounds are inclusive. |
70 | /// |
71 | /// The given ranges do not need to be in any specific order, and ranges |
72 | /// may overlap. |
73 | pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> { |
74 | let ranges: Vec<I> = intervals.into_iter().collect(); |
75 | // An empty set is case folded. |
76 | let folded = ranges.is_empty(); |
77 | let mut set = IntervalSet { ranges, folded }; |
78 | set.canonicalize(); |
79 | set |
80 | } |
81 | |
82 | /// Add a new interval to this set. |
83 | pub fn push(&mut self, interval: I) { |
84 | // TODO: This could be faster. e.g., Push the interval such that |
85 | // it preserves canonicalization. |
86 | self.ranges.push(interval); |
87 | self.canonicalize(); |
88 | // We don't know whether the new interval added here is considered |
89 | // case folded, so we conservatively assume that the entire set is |
90 | // no longer case folded if it was previously. |
91 | self.folded = false; |
92 | } |
93 | |
94 | /// Return an iterator over all intervals in this set. |
95 | /// |
96 | /// The iterator yields intervals in ascending order. |
97 | pub fn iter(&self) -> IntervalSetIter<'_, I> { |
98 | IntervalSetIter(self.ranges.iter()) |
99 | } |
100 | |
101 | /// Return an immutable slice of intervals in this set. |
102 | /// |
103 | /// The sequence returned is in canonical ordering. |
104 | pub fn intervals(&self) -> &[I] { |
105 | &self.ranges |
106 | } |
107 | |
108 | /// Expand this interval set such that it contains all case folded |
109 | /// characters. For example, if this class consists of the range `a-z`, |
110 | /// then applying case folding will result in the class containing both the |
111 | /// ranges `a-z` and `A-Z`. |
112 | /// |
113 | /// This returns an error if the necessary case mapping data is not |
114 | /// available. |
115 | pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> { |
116 | if self.folded { |
117 | return Ok(()); |
118 | } |
119 | let len = self.ranges.len(); |
120 | for i in 0..len { |
121 | let range = self.ranges[i]; |
122 | if let Err(err) = range.case_fold_simple(&mut self.ranges) { |
123 | self.canonicalize(); |
124 | return Err(err); |
125 | } |
126 | } |
127 | self.canonicalize(); |
128 | self.folded = true; |
129 | Ok(()) |
130 | } |
131 | |
132 | /// Union this set with the given set, in place. |
133 | pub fn union(&mut self, other: &IntervalSet<I>) { |
134 | if other.ranges.is_empty() || self.ranges == other.ranges { |
135 | return; |
136 | } |
137 | // This could almost certainly be done more efficiently. |
138 | self.ranges.extend(&other.ranges); |
139 | self.canonicalize(); |
140 | self.folded = self.folded && other.folded; |
141 | } |
142 | |
143 | /// Intersect this set with the given set, in place. |
144 | pub fn intersect(&mut self, other: &IntervalSet<I>) { |
145 | if self.ranges.is_empty() { |
146 | return; |
147 | } |
148 | if other.ranges.is_empty() { |
149 | self.ranges.clear(); |
150 | // An empty set is case folded. |
151 | self.folded = true; |
152 | return; |
153 | } |
154 | |
155 | // There should be a way to do this in-place with constant memory, |
156 | // but I couldn't figure out a simple way to do it. So just append |
157 | // the intersection to the end of this range, and then drain it before |
158 | // we're done. |
159 | let drain_end = self.ranges.len(); |
160 | |
161 | let mut ita = 0..drain_end; |
162 | let mut itb = 0..other.ranges.len(); |
163 | let mut a = ita.next().unwrap(); |
164 | let mut b = itb.next().unwrap(); |
165 | loop { |
166 | if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) { |
167 | self.ranges.push(ab); |
168 | } |
169 | let (it, aorb) = |
170 | if self.ranges[a].upper() < other.ranges[b].upper() { |
171 | (&mut ita, &mut a) |
172 | } else { |
173 | (&mut itb, &mut b) |
174 | }; |
175 | match it.next() { |
176 | Some(v) => *aorb = v, |
177 | None => break, |
178 | } |
179 | } |
180 | self.ranges.drain(..drain_end); |
181 | self.folded = self.folded && other.folded; |
182 | } |
183 | |
184 | /// Subtract the given set from this set, in place. |
185 | pub fn difference(&mut self, other: &IntervalSet<I>) { |
186 | if self.ranges.is_empty() || other.ranges.is_empty() { |
187 | return; |
188 | } |
189 | |
190 | // This algorithm is (to me) surprisingly complex. A search of the |
191 | // interwebs indicate that this is a potentially interesting problem. |
192 | // Folks seem to suggest interval or segment trees, but I'd like to |
193 | // avoid the overhead (both runtime and conceptual) of that. |
194 | // |
195 | // The following is basically my Shitty First Draft. Therefore, in |
196 | // order to grok it, you probably need to read each line carefully. |
197 | // Simplifications are most welcome! |
198 | // |
199 | // Remember, we can assume the canonical format invariant here, which |
200 | // says that all ranges are sorted, not overlapping and not adjacent in |
201 | // each class. |
202 | let drain_end = self.ranges.len(); |
203 | let (mut a, mut b) = (0, 0); |
204 | 'LOOP: while a < drain_end && b < other.ranges.len() { |
205 | // Basically, the easy cases are when neither range overlaps with |
206 | // each other. If the `b` range is less than our current `a` |
207 | // range, then we can skip it and move on. |
208 | if other.ranges[b].upper() < self.ranges[a].lower() { |
209 | b += 1; |
210 | continue; |
211 | } |
212 | // ... similarly for the `a` range. If it's less than the smallest |
213 | // `b` range, then we can add it as-is. |
214 | if self.ranges[a].upper() < other.ranges[b].lower() { |
215 | let range = self.ranges[a]; |
216 | self.ranges.push(range); |
217 | a += 1; |
218 | continue; |
219 | } |
220 | // Otherwise, we have overlapping ranges. |
221 | assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b])); |
222 | |
223 | // This part is tricky and was non-obvious to me without looking |
224 | // at explicit examples (see the tests). The trickiness stems from |
225 | // two things: 1) subtracting a range from another range could |
226 | // yield two ranges and 2) after subtracting a range, it's possible |
227 | // that future ranges can have an impact. The loop below advances |
228 | // the `b` ranges until they can't possible impact the current |
229 | // range. |
230 | // |
231 | // For example, if our `a` range is `a-t` and our next three `b` |
232 | // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply |
233 | // subtraction three times before moving on to the next `a` range. |
234 | let mut range = self.ranges[a]; |
235 | while b < other.ranges.len() |
236 | && !range.is_intersection_empty(&other.ranges[b]) |
237 | { |
238 | let old_range = range; |
239 | range = match range.difference(&other.ranges[b]) { |
240 | (None, None) => { |
241 | // We lost the entire range, so move on to the next |
242 | // without adding this one. |
243 | a += 1; |
244 | continue 'LOOP; |
245 | } |
246 | (Some(range1), None) | (None, Some(range1)) => range1, |
247 | (Some(range1), Some(range2)) => { |
248 | self.ranges.push(range1); |
249 | range2 |
250 | } |
251 | }; |
252 | // It's possible that the `b` range has more to contribute |
253 | // here. In particular, if it is greater than the original |
254 | // range, then it might impact the next `a` range *and* it |
255 | // has impacted the current `a` range as much as possible, |
256 | // so we can quit. We don't bump `b` so that the next `a` |
257 | // range can apply it. |
258 | if other.ranges[b].upper() > old_range.upper() { |
259 | break; |
260 | } |
261 | // Otherwise, the next `b` range might apply to the current |
262 | // `a` range. |
263 | b += 1; |
264 | } |
265 | self.ranges.push(range); |
266 | a += 1; |
267 | } |
268 | while a < drain_end { |
269 | let range = self.ranges[a]; |
270 | self.ranges.push(range); |
271 | a += 1; |
272 | } |
273 | self.ranges.drain(..drain_end); |
274 | self.folded = self.folded && other.folded; |
275 | } |
276 | |
277 | /// Compute the symmetric difference of the two sets, in place. |
278 | /// |
279 | /// This computes the symmetric difference of two interval sets. This |
280 | /// removes all elements in this set that are also in the given set, |
281 | /// but also adds all elements from the given set that aren't in this |
282 | /// set. That is, the set will contain all elements in either set, |
283 | /// but will not contain any elements that are in both sets. |
284 | pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) { |
285 | // TODO(burntsushi): Fix this so that it amortizes allocation. |
286 | let mut intersection = self.clone(); |
287 | intersection.intersect(other); |
288 | self.union(other); |
289 | self.difference(&intersection); |
290 | } |
291 | |
292 | /// Negate this interval set. |
293 | /// |
294 | /// For all `x` where `x` is any element, if `x` was in this set, then it |
295 | /// will not be in this set after negation. |
296 | pub fn negate(&mut self) { |
297 | if self.ranges.is_empty() { |
298 | let (min, max) = (I::Bound::min_value(), I::Bound::max_value()); |
299 | self.ranges.push(I::create(min, max)); |
300 | // The set containing everything must case folded. |
301 | self.folded = true; |
302 | return; |
303 | } |
304 | |
305 | // There should be a way to do this in-place with constant memory, |
306 | // but I couldn't figure out a simple way to do it. So just append |
307 | // the negation to the end of this range, and then drain it before |
308 | // we're done. |
309 | let drain_end = self.ranges.len(); |
310 | |
311 | // We do checked arithmetic below because of the canonical ordering |
312 | // invariant. |
313 | if self.ranges[0].lower() > I::Bound::min_value() { |
314 | let upper = self.ranges[0].lower().decrement(); |
315 | self.ranges.push(I::create(I::Bound::min_value(), upper)); |
316 | } |
317 | for i in 1..drain_end { |
318 | let lower = self.ranges[i - 1].upper().increment(); |
319 | let upper = self.ranges[i].lower().decrement(); |
320 | self.ranges.push(I::create(lower, upper)); |
321 | } |
322 | if self.ranges[drain_end - 1].upper() < I::Bound::max_value() { |
323 | let lower = self.ranges[drain_end - 1].upper().increment(); |
324 | self.ranges.push(I::create(lower, I::Bound::max_value())); |
325 | } |
326 | self.ranges.drain(..drain_end); |
327 | // We don't need to update whether this set is folded or not, because |
328 | // it is conservatively preserved through negation. Namely, if a set |
329 | // is not folded, then it is possible that its negation is folded, for |
330 | // example, [^☃]. But we're fine with assuming that the set is not |
331 | // folded in that case. (`folded` permits false negatives but not false |
332 | // positives.) |
333 | // |
334 | // But what about when a set is folded, is its negation also |
335 | // necessarily folded? Yes. Because if a set is folded, then for every |
336 | // character in the set, it necessarily included its equivalence class |
337 | // of case folded characters. Negating it in turn means that all |
338 | // equivalence classes in the set are negated, and any equivalence |
339 | // class that was previously not in the set is now entirely in the set. |
340 | } |
341 | |
342 | /// Converts this set into a canonical ordering. |
343 | fn canonicalize(&mut self) { |
344 | if self.is_canonical() { |
345 | return; |
346 | } |
347 | self.ranges.sort(); |
348 | assert!(!self.ranges.is_empty()); |
349 | |
350 | // Is there a way to do this in-place with constant memory? I couldn't |
351 | // figure out a way to do it. So just append the canonicalization to |
352 | // the end of this range, and then drain it before we're done. |
353 | let drain_end = self.ranges.len(); |
354 | for oldi in 0..drain_end { |
355 | // If we've added at least one new range, then check if we can |
356 | // merge this range in the previously added range. |
357 | if self.ranges.len() > drain_end { |
358 | let (last, rest) = self.ranges.split_last_mut().unwrap(); |
359 | if let Some(union) = last.union(&rest[oldi]) { |
360 | *last = union; |
361 | continue; |
362 | } |
363 | } |
364 | let range = self.ranges[oldi]; |
365 | self.ranges.push(range); |
366 | } |
367 | self.ranges.drain(..drain_end); |
368 | } |
369 | |
370 | /// Returns true if and only if this class is in a canonical ordering. |
371 | fn is_canonical(&self) -> bool { |
372 | for pair in self.ranges.windows(2) { |
373 | if pair[0] >= pair[1] { |
374 | return false; |
375 | } |
376 | if pair[0].is_contiguous(&pair[1]) { |
377 | return false; |
378 | } |
379 | } |
380 | true |
381 | } |
382 | } |
383 | |
/// An iterator over intervals.
///
/// This is a thin wrapper around a slice iterator and yields borrowed
/// intervals in the order in which the underlying slice stores them.
#[derive(Debug)]
pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>);

impl<'a, I> Iterator for IntervalSetIter<'a, I> {
    type Item = &'a I;

    fn next(&mut self) -> Option<&'a I> {
        self.0.next()
    }

    /// Forwards the exact bounds of the wrapped slice iterator. Without
    /// this, the default `(0, None)` hint would be reported and consumers
    /// such as `collect` could not reserve the right capacity up front.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}
395 | |
396 | pub trait Interval: |
397 | Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord |
398 | { |
/// The type of this interval's endpoints.
type Bound: Bound;

/// Returns the lower endpoint of this interval. Endpoints are inclusive.
fn lower(&self) -> Self::Bound;
/// Returns the upper endpoint of this interval. Endpoints are inclusive.
fn upper(&self) -> Self::Bound;
/// Overwrites the lower endpoint of this interval.
fn set_lower(&mut self, bound: Self::Bound);
/// Overwrites the upper endpoint of this interval.
fn set_upper(&mut self, bound: Self::Bound);
/// Performs simple case folding for this interval, using `intervals` as
/// the output.
///
/// NOTE(review): the only caller visible here passes the owning set's own
/// range vector and re-canonicalizes it afterwards, so implementations
/// presumably append the folded counterparts to `intervals` -- confirm
/// against the implementors.
fn case_fold_simple(
    &self,
    intervals: &mut Vec<Self>,
) -> Result<(), unicode::CaseFoldError>;
409 | |
410 | /// Create a new interval. |
411 | fn create(lower: Self::Bound, upper: Self::Bound) -> Self { |
412 | let mut int = Self::default(); |
413 | if lower <= upper { |
414 | int.set_lower(lower); |
415 | int.set_upper(upper); |
416 | } else { |
417 | int.set_lower(upper); |
418 | int.set_upper(lower); |
419 | } |
420 | int |
421 | } |
422 | |
423 | /// Union the given overlapping range into this range. |
424 | /// |
425 | /// If the two ranges aren't contiguous, then this returns `None`. |
426 | fn union(&self, other: &Self) -> Option<Self> { |
427 | if !self.is_contiguous(other) { |
428 | return None; |
429 | } |
430 | let lower = cmp::min(self.lower(), other.lower()); |
431 | let upper = cmp::max(self.upper(), other.upper()); |
432 | Some(Self::create(lower, upper)) |
433 | } |
434 | |
435 | /// Intersect this range with the given range and return the result. |
436 | /// |
437 | /// If the intersection is empty, then this returns `None`. |
438 | fn intersect(&self, other: &Self) -> Option<Self> { |
439 | let lower = cmp::max(self.lower(), other.lower()); |
440 | let upper = cmp::min(self.upper(), other.upper()); |
441 | if lower <= upper { |
442 | Some(Self::create(lower, upper)) |
443 | } else { |
444 | None |
445 | } |
446 | } |
447 | |
448 | /// Subtract the given range from this range and return the resulting |
449 | /// ranges. |
450 | /// |
451 | /// If subtraction would result in an empty range, then no ranges are |
452 | /// returned. |
453 | fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) { |
454 | if self.is_subset(other) { |
455 | return (None, None); |
456 | } |
457 | if self.is_intersection_empty(other) { |
458 | return (Some(self.clone()), None); |
459 | } |
460 | let add_lower = other.lower() > self.lower(); |
461 | let add_upper = other.upper() < self.upper(); |
462 | // We know this because !self.is_subset(other) and the ranges have |
463 | // a non-empty intersection. |
464 | assert!(add_lower || add_upper); |
465 | let mut ret = (None, None); |
466 | if add_lower { |
467 | let upper = other.lower().decrement(); |
468 | ret.0 = Some(Self::create(self.lower(), upper)); |
469 | } |
470 | if add_upper { |
471 | let lower = other.upper().increment(); |
472 | let range = Self::create(lower, self.upper()); |
473 | if ret.0.is_none() { |
474 | ret.0 = Some(range); |
475 | } else { |
476 | ret.1 = Some(range); |
477 | } |
478 | } |
479 | ret |
480 | } |
481 | |
482 | /// Compute the symmetric difference the given range from this range. This |
483 | /// returns the union of the two ranges minus its intersection. |
484 | fn symmetric_difference( |
485 | &self, |
486 | other: &Self, |
487 | ) -> (Option<Self>, Option<Self>) { |
488 | let union = match self.union(other) { |
489 | None => return (Some(self.clone()), Some(other.clone())), |
490 | Some(union) => union, |
491 | }; |
492 | let intersection = match self.intersect(other) { |
493 | None => return (Some(self.clone()), Some(other.clone())), |
494 | Some(intersection) => intersection, |
495 | }; |
496 | union.difference(&intersection) |
497 | } |
498 | |
499 | /// Returns true if and only if the two ranges are contiguous. Two ranges |
500 | /// are contiguous if and only if the ranges are either overlapping or |
501 | /// adjacent. |
502 | fn is_contiguous(&self, other: &Self) -> bool { |
503 | let lower1 = self.lower().as_u32(); |
504 | let upper1 = self.upper().as_u32(); |
505 | let lower2 = other.lower().as_u32(); |
506 | let upper2 = other.upper().as_u32(); |
507 | cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1) |
508 | } |
509 | |
510 | /// Returns true if and only if the intersection of this range and the |
511 | /// other range is empty. |
512 | fn is_intersection_empty(&self, other: &Self) -> bool { |
513 | let (lower1, upper1) = (self.lower(), self.upper()); |
514 | let (lower2, upper2) = (other.lower(), other.upper()); |
515 | cmp::max(lower1, lower2) > cmp::min(upper1, upper2) |
516 | } |
517 | |
518 | /// Returns true if and only if this range is a subset of the other range. |
519 | fn is_subset(&self, other: &Self) -> bool { |
520 | let (lower1, upper1) = (self.lower(), self.upper()); |
521 | let (lower2, upper2) = (other.lower(), other.upper()); |
522 | (lower2 <= lower1 && lower1 <= upper2) |
523 | && (lower2 <= upper1 && upper1 <= upper2) |
524 | } |
525 | } |
526 | |
/// A value that can serve as the endpoint of an interval.
///
/// This file implements it for `u8` and `char`.
pub trait Bound:
    Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord
{
    /// Returns the smallest value of this type.
    fn min_value() -> Self;
    /// Returns the largest value of this type.
    fn max_value() -> Self;
    /// Returns this value converted to a `u32`.
    fn as_u32(self) -> u32;
    /// Returns the value that directly follows `self`.
    ///
    /// The `u8` and `char` implementations in this file panic when there is
    /// no such value, so callers are expected to compare against
    /// `max_value()` first.
    fn increment(self) -> Self;
    /// Returns the value that directly precedes `self`.
    ///
    /// The `u8` and `char` implementations in this file panic when there is
    /// no such value, so callers are expected to compare against
    /// `min_value()` first.
    fn decrement(self) -> Self;
}
536 | |
537 | impl Bound for u8 { |
538 | fn min_value() -> Self { |
539 | u8::MIN |
540 | } |
541 | fn max_value() -> Self { |
542 | u8::MAX |
543 | } |
544 | fn as_u32(self) -> u32 { |
545 | u32::from(self) |
546 | } |
547 | fn increment(self) -> Self { |
548 | self.checked_add(1).unwrap() |
549 | } |
550 | fn decrement(self) -> Self { |
551 | self.checked_sub(1).unwrap() |
552 | } |
553 | } |
554 | |
555 | impl Bound for char { |
556 | fn min_value() -> Self { |
557 | ' \x00' |
558 | } |
559 | fn max_value() -> Self { |
560 | ' \u{10FFFF}' |
561 | } |
562 | fn as_u32(self) -> u32 { |
563 | u32::from(self) |
564 | } |
565 | |
566 | fn increment(self) -> Self { |
567 | match self { |
568 | ' \u{D7FF}' => ' \u{E000}' , |
569 | c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(), |
570 | } |
571 | } |
572 | |
573 | fn decrement(self) -> Self { |
574 | match self { |
575 | ' \u{E000}' => ' \u{D7FF}' , |
576 | c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(), |
577 | } |
578 | } |
579 | } |
580 | |
581 | // Tests for interval sets are written in src/hir.rs against the public API. |
582 | |