Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX-#4456: fix loc in case when need reindex item #4457

Merged
merged 11 commits into from
May 26, 2022
1 change: 1 addition & 0 deletions docs/release_notes/release_notes-0.15.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Key Features and Updates
* FIX-#4373: Fix invalid file path when trying `read_csv_glob` with `usecols` parameter (#4405)
* FIX-#4394: Fix issue with multiindex metadata desync (#4395)
* FIX-#4425: Add parameters to groupby pct_change (#4429)
* FIX-#4457: Fix `loc` in case when need reindex item (#4457)
* FIX-#4414: Add missing f prefix on f-strings found at https://codereview.doctor (#4415)
* FIX-#4461: Fix S3 CSV data path (#4462)
* Performance enhancements
Expand Down
22 changes: 8 additions & 14 deletions modin/pandas/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,23 +424,17 @@ def _broadcast_item(self, row_lookup, col_lookup, item, to_shape):
# the target the user is trying to overwrite. This
if isinstance(item, (pandas.Series, pandas.DataFrame, Series, DataFrame)):
# convert indices in lookups to names, as Pandas reindex expects them to be so
axes_to_reindex = {}
index_values = self.qc.index[row_lookup]
if not all(idx in item.index for idx in index_values):
raise ValueError(
"Must have equal len keys and value when setting with "
+ "an iterable"
)
if not index_values.equals(item.index):
axes_to_reindex["index"] = index_values
if hasattr(item, "columns"):
column_values = self.qc.columns[col_lookup]
if not all(col in item.columns for col in column_values):
# TODO: think if it is needed to handle cases when columns have duplicate names
raise ValueError(
"Must have equal len keys and value when setting "
+ "with an iterable"
)
item = item.reindex(index=index_values, columns=column_values)
else:
item = item.reindex(index=index_values)
if not column_values.equals(item.columns):
axes_to_reindex["columns"] = column_values
# New value for columns/index make that reindex add NaN values
if axes_to_reindex:
item = item.reindex(**axes_to_reindex)
try:
item = np.array(item)
if np.prod(to_shape) == np.prod(item.shape):
Expand Down
88 changes: 88 additions & 0 deletions modin/pandas/test/dataframe/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from modin.config import NPartitions
from modin.utils import get_current_execution
from modin.test.test_utils import warns_that_defaulting_to_pandas
from modin.pandas.indexing import is_range_like

NPartitions.put(4)

Expand All @@ -67,6 +68,24 @@ def eval_setitem(md_df, pd_df, value, col=None, loc=None):
)


def eval_loc(md_df, pd_df, value, key):
if isinstance(value, tuple):
assert len(value) == 2
# case when value for pandas different
md_value, pd_value = value
else:
md_value, pd_value = value, value

eval_general(
md_df,
pd_df,
lambda df: df.loc.__setitem__(
key, pd_value if isinstance(df, pandas.DataFrame) else md_value
),
__inplace__=True,
)


@pytest.mark.parametrize(
"dates",
[
Expand Down Expand Up @@ -384,6 +403,75 @@ def test_loc(data):
modin_df.loc["NO_EXIST"]


@pytest.mark.parametrize(
"key_getter, value_getter",
[
pytest.param(
lambda df, axis: (
(slice(None), df.axes[axis][:2])
if axis
else (df.axes[axis][:2], slice(None))
),
lambda df, axis: df.iloc[:, :1] if axis else df.iloc[:1, :],
id="len(key)_>_len(value)",
),
pytest.param(
lambda df, axis: (
(slice(None), df.axes[axis][:2])
if axis
else (df.axes[axis][:2], slice(None))
),
lambda df, axis: df.iloc[:, :3] if axis else df.iloc[:3, :],
id="len(key)_<_len(value)",
),
pytest.param(
lambda df, axis: (
(slice(None), df.axes[axis][:2])
if axis
else (df.axes[axis][:2], slice(None))
),
lambda df, axis: df.iloc[:, :2] if axis else df.iloc[:2, :],
id="len(key)_==_len(value)",
),
],
)
@pytest.mark.parametrize("key_axis", [0, 1])
@pytest.mark.parametrize("reverse_value_index", [True, False])
@pytest.mark.parametrize("reverse_value_columns", [True, False])
def test_loc_4456(
key_getter, value_getter, key_axis, reverse_value_index, reverse_value_columns
):
data = test_data["float_nan_data"]
modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)

key = key_getter(pandas_df, key_axis)

# `df.loc` doesn't work right for range-like indexers. Converting them to a list.
# https://github.com/modin-project/modin/issues/4497
if is_range_like(key[0]):
key = (list(key[0]), key[1])
if is_range_like(key[1]):
key = (key[0], list(key[1]))

value = pandas.DataFrame(
np.random.randint(0, 100, size=pandas_df.shape),
index=pandas_df.index,
columns=pandas_df.columns,
)
pdf_value = value_getter(value, key_axis)
mdf_value = value_getter(pd.DataFrame(value), key_axis)

if reverse_value_index:
pdf_value = pdf_value.reindex(index=pdf_value.index[::-1])
mdf_value = mdf_value.reindex(index=mdf_value.index[::-1])
if reverse_value_columns:
pdf_value = pdf_value.reindex(columns=pdf_value.columns[::-1])
mdf_value = mdf_value.reindex(columns=mdf_value.columns[::-1])

eval_loc(modin_df, pandas_df, pdf_value, key)
eval_loc(modin_df, pandas_df, (mdf_value, pdf_value), key)


# This tests the bug from https://github.com/modin-project/modin/issues/3736
def test_loc_setting_single_categorical_column():
modin_df = pd.DataFrame({"status": ["a", "b", "c"]}, dtype="category")
Expand Down