From 033041d8e9659b7ceaf37d3c1039176ab632a22f Mon Sep 17 00:00:00 2001
From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com>
Date: Thu, 1 Jul 2021 12:18:52 -0700
Subject: [PATCH] Backport PR #42334: PERF/REGR: restore IntervalIndex._intersection_non_unique (#42336)

Co-authored-by: jbrockmendel
---
 pandas/core/indexes/interval.py | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 845d21e18d287..94d7814151a25 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -808,7 +808,8 @@ def _intersection(self, other, sort):
             # multiple NaNs
             taken = other._intersection_unique(self)
         else:
-            return super()._intersection(other, sort)
+            # duplicates
+            taken = self._intersection_non_unique(other)
 
         if sort is None:
             taken = taken.sort_values()
@@ -837,6 +838,35 @@ def _intersection_unique(self, other: IntervalIndex) -> IntervalIndex:
 
         return self.take(indexer)
 
+    def _intersection_non_unique(self, other: IntervalIndex) -> IntervalIndex:
+        """
+        Used when the IntervalIndex has some common endpoints,
+        on either side.
+        Return the intersection with another IntervalIndex.
+
+        Parameters
+        ----------
+        other : IntervalIndex
+
+        Returns
+        -------
+        IntervalIndex
+        """
+        # Note: this is about 3.25x faster than super()._intersection(other)
+        # in IntervalIndexMethod.time_intersection_both_duplicate(1000)
+        mask = np.zeros(len(self), dtype=bool)
+
+        if self.hasnans and other.hasnans:
+            first_nan_loc = np.arange(len(self))[self.isna()][0]
+            mask[first_nan_loc] = True
+
+        other_tups = set(zip(other.left, other.right))
+        for i, tup in enumerate(zip(self.left, self.right)):
+            if tup in other_tups:
+                mask[i] = True
+
+        return self[mask]
+
     # --------------------------------------------------------------------
 
     @property