Fix handling of negative slices

Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
modin-project · Nov 18, 2021 · 40885a3 · 40885a3
1 parent a2f3d7f
commit 40885a3
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 10 deletions.
diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py
@@ -588,10 +588,8 @@ def _handle_boolean_masking(self, row_loc, col_loc):
             failure_condition=not isinstance(row_loc, Series),
             extra_log=f"Only ``modin.pandas.Series`` boolean masks are acceptable, got: {type(row_loc)}",
         )
-
-        aligned_loc = row_loc.reindex(index=self.df.index)
         masked_df = self.df.__constructor__(
-            query_compiler=self.qc.getitem_array(aligned_loc._query_compiler)
+            query_compiler=self.qc.getitem_array(row_loc._query_compiler)
         )
         # Passing `slice(None)` as a row indexer since we've just applied it
         return type(self)(masked_df)[(slice(None), col_loc)]
@@ -790,13 +788,24 @@ def _compute_lookup(self, row_loc, col_loc):
                 if isinstance(axis_loc, slice) and axis_loc == slice(None):
                     axis_lookup = axis_loc
                 else:
-                    axis_lookup = self.qc.get_axis(axis).slice_indexer(
+                    axis_labels = self.qc.get_axis(axis)
+                    # `slice_indexer` returns a fully-defined numeric slice for a non-fully-defined labels-based slice
+                    axis_lookup = axis_labels.slice_indexer(
                         axis_loc.start, axis_loc.stop, axis_loc.step
                     )
+                    # Converting negative indices to its actual positions:
                     axis_lookup = pandas.RangeIndex(
-                        axis_lookup.start,
-                        axis_lookup.stop,
-                        axis_lookup.step,
+                        start=(
+                            axis_lookup.start
+                            if axis_lookup.start >= 0
+                            else axis_lookup.start + len(axis_labels)
+                        ),
+                        stop=(
+                            axis_lookup.stop
+                            if axis_lookup.stop >= 0
+                            else axis_lookup.stop + len(axis_labels)
+                        ),
+                        step=axis_lookup.step,
                     )
             elif self.qc.has_multiindex(axis):
                 # `Index.get_locs` raises an IndexError by itself if missing labels were provided,
@@ -808,14 +817,15 @@ def _compute_lookup(self, row_loc, col_loc):
             elif is_boolean_array(axis_loc):
                 axis_lookup = boolean_mask_to_numeric(axis_loc)
             else:
+                axis_labels = self.qc.get_axis(axis)
                 if is_list_like(axis_loc) and not isinstance(
                     axis_loc, (np.ndarray, pandas.Index)
                 ):
                     # `Index.get_indexer_for` works much faster with numpy arrays than with python lists,
                     # so although we lose some time here on converting to numpy, `Index.get_indexer_for`
                     # speedup covers the loss that we gain here.
-                    axis_loc = np.array(axis_loc)
-                axis_lookup = self.qc.get_axis(axis).get_indexer_for(axis_loc)
+                    axis_loc = np.array(axis_loc, dtype=axis_labels.dtype)
+                axis_lookup = axis_labels.get_indexer_for(axis_loc)
                 # `Index.get_indexer_for` sets -1 value for missing labels, we have to verify whether
                 # there are any -1 in the received indexer to raise a KeyError here.
                 missing_mask = axis_lookup < 0
@@ -963,7 +973,7 @@ def _compute_lookup(self, row_loc, col_loc):
                     # `Index.__getitem__` works much faster with numpy arrays than with python lists,
                     # so although we lose some time here on converting to numpy, `Index.__getitem__`
                     # speedup covers the loss that we gain here.
-                    axis_loc = np.array(axis_loc)
+                    axis_loc = np.array(axis_loc, dtype=np.int64)
                 # Relatively fast check allows us to not trigger `self.qc.get_axis()` computation
                 # if there're no negative indices and so they're not depend on the axis length.
                 if isinstance(axis_loc, np.ndarray) and not (axis_loc < 0).any():

diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py
@@ -574,6 +574,29 @@ def test_loc_series():
     df_equals(pd_df, md_df)
 
 
+@pytest.mark.parametrize("locator_name", ["loc", "iloc"])
+@pytest.mark.parametrize(
+    "slice_indexer",
+    [
+        slice(None, None, -2),
+        slice(1, 10, None),
+        slice(None, 10, None),
+        slice(10, None, None),
+        slice(10, None, -2),
+        slice(-10, None, -2),
+        slice(None, 1_000_000_000, None),
+    ],
+)
+def test_loc_iloc_slice_indexer(locator_name, slice_indexer):
+    md_df, pd_df = create_test_dfs(test_data_values[0])
+    # Shifting the index, so labels won't match its position
+    shifted_index = pandas.RangeIndex(1, len(md_df) + 1)
+    md_df.index = shifted_index
+    pd_df.index = shifted_index
+
+    eval_general(md_df, pd_df, lambda df: getattr(df, locator_name)[slice_indexer])
+
+
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_pop(request, data):
     modin_df = pd.DataFrame(data)