diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index e3051bf31..e063390a8 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -19,7 +19,7 @@ use crate::unicode; // // Some of the implementation complexity here is a result of me wanting to // preserve the sequential representation without using additional memory. -// In some cases, we do use linear extra memory, but it is at most 2x and it +// In many cases, we do use linear extra memory, but it is at most 2x and it // is amortized. If we relaxed the memory requirements, this implementation // could become much simpler. The extra memory is honestly probably OK, but // character classes (especially of the Unicode variety) can become quite @@ -81,45 +81,14 @@ impl IntervalSet { /// Add a new interval to this set. pub fn push(&mut self, interval: I) { + // TODO: This could be faster. e.g., Push the interval such that + // it preserves canonicalization. + self.ranges.push(interval); + self.canonicalize(); // We don't know whether the new interval added here is considered // case folded, so we conservatively assume that the entire set is // no longer case folded if it was previously. self.folded = false; - - if self.ranges.is_empty() { - self.ranges.push(interval); - return; - } - - // Find the first range that is not greater than the new interval. - // This is the first range that could possibly be unioned with the - // new interval. - let mut drain_end = self.ranges.len(); - while drain_end > 0 - && self.ranges[drain_end - 1].lower() > interval.upper() - && !self.ranges[drain_end - 1].is_contiguous(&interval) - { - drain_end -= 1; - } - - // Try to union the new interval with old intervals backwards. - if drain_end > 0 && self.ranges[drain_end - 1].is_contiguous(&interval) - { - self.ranges[drain_end - 1] = - self.ranges[drain_end - 1].union(&interval).unwrap(); - for i in (0..drain_end - 1).rev() { - if let Some(union) = - self.ranges[drain_end - 1].union(&self.ranges[i]) - { - self.ranges[drain_end - 1] = union; - } else { - self.ranges.drain(i + 1..drain_end - 1); - break; - } - } - } else { - self.ranges.insert(drain_end, interval); - } } /// Return an iterator over all intervals in this set. @@ -223,13 +192,34 @@ impl IntervalSet { // Folks seem to suggest interval or segment trees, but I'd like to // avoid the overhead (both runtime and conceptual) of that. // + // The following is basically my Shitty First Draft. Therefore, in + // order to grok it, you probably need to read each line carefully. + // Simplifications are most welcome! + // // Remember, we can assume the canonical format invariant here, which // says that all ranges are sorted, not overlapping and not adjacent in // each class. let drain_end = self.ranges.len(); + let (mut a, mut b) = (0, 0); + 'LOOP: while a < drain_end && b < other.ranges.len() { + // Basically, the easy cases are when neither range overlaps with + // each other. If the `b` range is less than our current `a` + // range, then we can skip it and move on. + if other.ranges[b].upper() < self.ranges[a].lower() { + b += 1; + continue; + } + // ... similarly for the `a` range. If it's less than the smallest + // `b` range, then we can add it as-is. + if self.ranges[a].upper() < other.ranges[b].lower() { + let range = self.ranges[a]; + self.ranges.push(range); + a += 1; + continue; + } + // Otherwise, we have overlapping ranges. + assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b])); - let mut b = 0; - for a in 0..drain_end { // This part is tricky and was non-obvious to me without looking // at explicit examples (see the tests). The trickiness stems from // two things: 1) subtracting a range from another range could @@ -241,34 +231,47 @@ impl IntervalSet { // For example, if our `a` range is `a-t` and our next three `b` // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply // subtraction three times before moving on to the next `a` range. - self.ranges.push(self.ranges[a]); - // Only when `b` is not above `a`, `b` might apply to current - // `a` range. + let mut range = self.ranges[a]; while b < other.ranges.len() - && other.ranges[b].lower() <= self.ranges[a].upper() + && !range.is_intersection_empty(&other.ranges[b]) { - match self.ranges.pop().unwrap().difference(&other.ranges[b]) { - (Some(range1), None) | (None, Some(range1)) => { - self.ranges.push(range1); + let old_range = range; + range = match range.difference(&other.ranges[b]) { + (None, None) => { + // We lost the entire range, so move on to the next + // without adding this one. + a += 1; + continue 'LOOP; } + (Some(range1), None) | (None, Some(range1)) => range1, (Some(range1), Some(range2)) => { self.ranges.push(range1); - self.ranges.push(range2); + range2 } - (None, None) => {} + }; + // It's possible that the `b` range has more to contribute + // here. In particular, if it is greater than the original + // range, then it might impact the next `a` range *and* it + // has impacted the current `a` range as much as possible, + // so we can quit. We don't bump `b` so that the next `a` + // range can apply it. + if other.ranges[b].upper() > old_range.upper() { + break; } - // The next `b` range might apply to the current + // Otherwise, the next `b` range might apply to the current // `a` range. b += 1; } - // It's possible that the last `b` range has more to - // contribute to the next `a`. We don't bump the last - // `b` so that the next `a` range can apply it. - b = b.saturating_sub(1); + self.ranges.push(range); + a += 1; + } + while a < drain_end { + let range = self.ranges[a]; + self.ranges.push(range); + a += 1; } - self.ranges.drain(..drain_end); - self.folded = self.ranges.is_empty() || (self.folded && other.folded); + self.folded = self.folded && other.folded; } /// Compute the symmetric difference of the two sets, in place. @@ -279,83 +282,11 @@ impl IntervalSet { /// set. That is, the set will contain all elements in either set, /// but will not contain any elements that are in both sets. pub fn symmetric_difference(&mut self, other: &IntervalSet) { - if self.ranges.is_empty() { - self.ranges.extend(&other.ranges); - self.folded = other.folded; - return; - } - if other.ranges.is_empty() { - return; - } - - // There should be a way to do this in-place with constant memory, - // but I couldn't figure out a simple way to do it. So just append - // the symmetric difference to the end of this range, and then drain - // it before we're done. - let drain_end = self.ranges.len(); - let mut b = 0; - let mut b_range = Some(other.ranges[b]); - for a in 0..drain_end { - self.ranges.push(self.ranges[a]); - while b_range - .map_or(false, |r| r.lower() <= self.ranges[a].upper()) - { - let (range1, range2) = match self - .ranges - .pop() - .unwrap() - .symmetric_difference(&b_range.as_ref().unwrap()) - { - (Some(range1), None) | (None, Some(range1)) => { - (Some(range1), None) - } - (Some(range1), Some(range2)) => { - (Some(range1), Some(range2)) - } - (None, None) => (None, None), - }; - if let Some(range) = range1 { - if self.ranges.len() > drain_end - && self.ranges.last().unwrap().is_contiguous(&range) - { - self.ranges - .last_mut() - .map(|last| *last = last.union(&range).unwrap()); - } else { - self.ranges.push(range); - } - } - if let Some(range) = range2 { - self.ranges.push(range); - } - - b_range = if self.ranges.len() > drain_end - && self.ranges.last().unwrap().upper() - > self.ranges[a].upper() - { - Some(*self.ranges.last().unwrap()) - } else { - b += 1; - other.ranges.get(b).cloned() - }; - } - } - while let Some(range) = b_range { - if self.ranges.len() > drain_end - && self.ranges.last().unwrap().is_contiguous(&range) - { - self.ranges - .last_mut() - .map(|last| *last = last.union(&range).unwrap()); - } else { - self.ranges.push(range); - } - b += 1; - b_range = other.ranges.get(b).cloned(); - } - - self.ranges.drain(..drain_end); - self.folded = self.ranges.is_empty() || (self.folded && other.folded); + // TODO(burntsushi): Fix this so that it amortizes allocation. + let mut intersection = self.clone(); + intersection.intersect(other); + self.union(other); + self.difference(&intersection); } /// Negate this interval set. @@ -371,44 +302,28 @@ impl IntervalSet { return; } + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the negation to the end of this range, and then drain it before + // we're done. + let drain_end = self.ranges.len(); + // We do checked arithmetic below because of the canonical ordering // invariant. if self.ranges[0].lower() > I::Bound::min_value() { - let mut pre_upper = self.ranges[0].upper(); - self.ranges[0] = I::create( - I::Bound::min_value(), - self.ranges[0].lower().decrement(), - ); - for i in 1..self.ranges.len() { - let lower = pre_upper.increment(); - pre_upper = self.ranges[i].upper(); - self.ranges[i] = - I::create(lower, self.ranges[i].lower().decrement()); - } - if pre_upper < I::Bound::max_value() { - self.ranges.push(I::create( - pre_upper.increment(), - I::Bound::max_value(), - )); - } - } else { - for i in 1..self.ranges.len() { - self.ranges[i - 1] = I::create( - self.ranges[i - 1].upper().increment(), - self.ranges[i].lower().decrement(), - ); - } - if self.ranges.last().unwrap().upper() < I::Bound::max_value() { - self.ranges.last_mut().map(|range| { - *range = I::create( - range.upper().increment(), - I::Bound::max_value(), - ) - }); - } else { - self.ranges.pop(); - } + let upper = self.ranges[0].lower().decrement(); + self.ranges.push(I::create(I::Bound::min_value(), upper)); + } + for i in 1..drain_end { + let lower = self.ranges[i - 1].upper().increment(); + let upper = self.ranges[i].lower().decrement(); + self.ranges.push(I::create(lower, upper)); + } + if self.ranges[drain_end - 1].upper() < I::Bound::max_value() { + let lower = self.ranges[drain_end - 1].upper().increment(); + self.ranges.push(I::create(lower, I::Bound::max_value())); } + self.ranges.drain(..drain_end); // We don't need to update whether this set is folded or not, because // it is conservatively preserved through negation. Namely, if a set // is not folded, then it is possible that its negation is folded, for @@ -422,7 +337,6 @@ impl IntervalSet { // of case folded characters. Negating it in turn means that all // equivalence classes in the set are negated, and any equivalence // class that was previously not in the set is now entirely in the set. - self.folded = self.ranges.is_empty() || self.folded; } /// Converts this set into a canonical ordering. @@ -433,20 +347,24 @@ impl IntervalSet { self.ranges.sort(); assert!(!self.ranges.is_empty()); - // We maintain the canonicalization results in-place at `0..newi`. - // `newi` will keep track of the end of the canonicalized ranges. - let mut newi = 0; - for oldi in 1..self.ranges.len() { - // The last new range gets merged with currnet old range when - // unionable. If not, we update `newi` and store it as a new range. - if let Some(union) = self.ranges[newi].union(&self.ranges[oldi]) { - self.ranges[newi] = union; - } else { - newi += 1; - self.ranges[newi] = self.ranges[oldi]; + // Is there a way to do this in-place with constant memory? I couldn't + // figure out a way to do it. So just append the canonicalization to + // the end of this range, and then drain it before we're done. + let drain_end = self.ranges.len(); + for oldi in 0..drain_end { + // If we've added at least one new range, then check if we can + // merge this range in the previously added range. + if self.ranges.len() > drain_end { + let (last, rest) = self.ranges.split_last_mut().unwrap(); + if let Some(union) = last.union(&rest[oldi]) { + *last = union; + continue; + } } + let range = self.ranges[oldi]; + self.ranges.push(range); } - self.ranges.truncate(newi + 1); + self.ranges.drain(..drain_end); } /// Returns true if and only if this class is in a canonical ordering. @@ -568,13 +486,7 @@ pub trait Interval: other: &Self, ) -> (Option, Option) { let union = match self.union(other) { - None => { - return if self.upper() < other.lower() { - (Some(self.clone()), Some(other.clone())) - } else { - (Some(other.clone()), Some(self.clone())) - } - } + None => return (Some(self.clone()), Some(other.clone())), Some(union) => union, }; let intersection = match self.intersect(other) {