Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX-#4456: fix loc in case when need reindex item #4457

Merged
merged 11 commits into from
May 26, 2022
1 change: 1 addition & 0 deletions docs/release_notes/release_notes-0.15.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Key Features and Updates
* FIX-#4373: Fix invalid file path when trying `read_csv_glob` with `usecols` parameter (#4405)
* FIX-#4394: Fix issue with multiindex metadata desync (#4395)
* FIX-#4425: Add parameters to groupby pct_change (#4429)
* FIX-#4457: Fix `loc` in case when need reindex item (#4457)
* Performance enhancements
* FEAT-#4320: Add connectorx as an alternative engine for read_sql (#4346)
* Benchmarking enhancements
Expand Down
26 changes: 12 additions & 14 deletions modin/pandas/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,23 +424,21 @@ def _broadcast_item(self, row_lookup, col_lookup, item, to_shape):
# the target the user is trying to overwrite. This
if isinstance(item, (pandas.Series, pandas.DataFrame, Series, DataFrame)):
# convert indices in lookups to names, as Pandas reindex expects them to be so
kw = {}
anmyachev marked this conversation as resolved.
Show resolved Hide resolved
index_values = self.qc.index[row_lookup]
if not all(idx in item.index for idx in index_values):
raise ValueError(
"Must have equal len keys and value when setting with "
+ "an iterable"
)
if len(index_values) < len(item.index) or not all(
idx in item.index for idx in index_values
):
kw["index"] = index_values
anmyachev marked this conversation as resolved.
Show resolved Hide resolved
if hasattr(item, "columns"):
column_values = self.qc.columns[col_lookup]
if not all(col in item.columns for col in column_values):
# TODO: think if it is needed to handle cases when columns have duplicate names
raise ValueError(
"Must have equal len keys and value when setting "
+ "with an iterable"
)
item = item.reindex(index=index_values, columns=column_values)
else:
item = item.reindex(index=index_values)
if len(column_values) < len(item.columns) or not all(
col in item.columns for col in column_values
):
kw["columns"] = column_values
# New value for columns/index make that reindex add NaN values
if kw:
item = item.reindex(**kw)
try:
item = np.array(item)
if np.prod(to_shape) == np.prod(item.shape):
Expand Down
58 changes: 58 additions & 0 deletions modin/pandas/test/dataframe/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,24 @@ def eval_setitem(md_df, pd_df, value, col=None, loc=None):
)


def eval_loc(md_df, pd_df, value, key):
if isinstance(value, tuple):
assert len(value) == 2
# case when value for pandas different
md_value, pd_value = value
else:
md_value, pd_value = value, value

eval_general(
md_df,
pd_df,
lambda df: df.loc.__setitem__(
key, pd_value if isinstance(df, pandas.DataFrame) else md_value
),
__inplace__=True,
)


@pytest.mark.parametrize(
"dates",
[
Expand Down Expand Up @@ -384,6 +402,46 @@ def test_loc(data):
modin_df.loc["NO_EXIST"]


def test_loc_4456():
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a chance we can set a pair of (value, key) as a test parameter in order to reduce it? for now the test looks a bit complicated

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like I've made it a bit easier :)

data = test_data["float_nan_data"]
modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)

key = ["col1", "col2"]

# len(key) == df_value.columns; different columns
value = [[1, 2]] * pandas_df.shape[0]
df_value = pandas.DataFrame(value, columns=["value_col1", "value_col2"])
eval_loc(modin_df, pandas_df, df_value, (slice(None), key))

# len(key) == df_value.columns; different columns; modin DataFrame
mdf_value = pd.DataFrame(df_value)
eval_loc(modin_df, pandas_df, (mdf_value, df_value), (slice(None), key))

# len(key) == df_value.columns; different index
df_value = pandas.DataFrame(
value, columns=key, index=list(reversed(pandas_df.index))
)
eval_loc(modin_df, pandas_df, df_value, (slice(None), key))

# len(key) > df_value.columns
value = [[1]] * pandas_df.shape[0]
df_value = pandas.DataFrame(value, columns=key[:1])
eval_loc(modin_df, pandas_df, df_value, (slice(None), key))

# len(key) == df_value.columns; different columns; modin Series
# import pdb; pdb.set_trace()
anmyachev marked this conversation as resolved.
Show resolved Hide resolved
md_series_value = mdf_value[mdf_value.columns[0]]
pd_series_value = df_value[df_value.columns[0]]
eval_loc(
modin_df, pandas_df, (md_series_value, pd_series_value), (slice(None), key)
)

# len(key) < df_value.columns
value = [[1, 2, 3]] * pandas_df.shape[0]
df_value = pandas.DataFrame(value, columns=key + ["col3"])
eval_loc(modin_df, pandas_df, df_value, (slice(None), key))


# This tests the bug from https://github.com/modin-project/modin/issues/3736
def test_loc_setting_single_categorical_column():
modin_df = pd.DataFrame({"status": ["a", "b", "c"]}, dtype="category")
Expand Down