Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: Enforce disallowed merging scenarios #49429

Merged
merged 5 commits into from
Nov 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,9 @@ Removal of prior version deprecations/changes
- Enforced disallowing a string column label into ``times`` in :meth:`DataFrame.ewm` (:issue:`43265`)
- Enforced disallowing a tuple of column labels into :meth:`.DataFrameGroupBy.__getitem__` (:issue:`30546`)
- Enforced disallowing setting values with ``.loc`` using a positional slice. Use ``.loc`` with labels or ``.iloc`` with positions instead (:issue:`31840`)
- Enforced disallowing ``dict`` or ``set`` objects in ``suffixes`` in :func:`merge` (:issue:`34810`)
- Enforced disallowing :func:`merge` from producing duplicated columns when the ``suffixes`` keyword collides with already existing columns (:issue:`22818`)
- Enforced disallowing using :func:`merge` or :func:`join` on a different number of levels (:issue:`34862`)
- Removed setting Categorical._codes directly (:issue:`41429`)
- Removed setting Categorical.categories directly (:issue:`47834`)
- Removed argument ``inplace`` from :meth:`Categorical.add_categories`, :meth:`Categorical.remove_categories`, :meth:`Categorical.set_categories`, :meth:`Categorical.rename_categories`, :meth:`Categorical.reorder_categories`, :meth:`Categorical.set_ordered`, :meth:`Categorical.as_ordered`, :meth:`Categorical.as_unordered` (:issue:`37981`, :issue:`41118`, :issue:`41133`, :issue:`47834`)
Expand Down
29 changes: 11 additions & 18 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -676,16 +676,14 @@ def __init__(
f"right_index parameter must be of type bool, not {type(right_index)}"
)

# warn user when merging between different levels
# GH 40993: raise when merging between different levels; enforced in 2.0
if _left.columns.nlevels != _right.columns.nlevels:
msg = (
"merging between different levels is deprecated and will be removed "
f"in a future version. ({_left.columns.nlevels} levels on the left, "
"Not allowed to merge between different levels. "
f"({_left.columns.nlevels} levels on the left, "
f"{_right.columns.nlevels} on the right)"
)
# stacklevel chosen to be correct when this is reached via pd.merge
# (and not DataFrame.join)
warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
raise MergeError(msg)

self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)

Expand Down Expand Up @@ -2475,13 +2473,10 @@ def _items_overlap_with_suffix(
If corresponding suffix is empty, the entry is simply converted to string.

"""
if not is_list_like(suffixes, allow_sets=False):
warnings.warn(
f"Passing 'suffixes' as a {type(suffixes)}, is not supported and may give "
"unexpected results. Provide 'suffixes' as a tuple instead. In the "
"future a 'TypeError' will be raised.",
FutureWarning,
stacklevel=find_stack_level(),
if not is_list_like(suffixes, allow_sets=False) or isinstance(suffixes, dict):
raise TypeError(
f"Passing 'suffixes' as a {type(suffixes)}, is not supported. "
"Provide 'suffixes' as a tuple instead."
)

to_rename = left.intersection(right)
Expand Down Expand Up @@ -2527,11 +2522,9 @@ def renamer(x, suffix):
if not rlabels.is_unique:
dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist())
if dups:
warnings.warn(
f"Passing 'suffixes' which cause duplicate columns {set(dups)} in the "
f"result is deprecated and will raise a MergeError in a future version.",
FutureWarning,
stacklevel=find_stack_level(),
raise MergeError(
f"Passing 'suffixes' which cause duplicate columns {set(dups)} is "
f"not allowed.",
)

return llabels, rlabels
23 changes: 10 additions & 13 deletions pandas/tests/frame/methods/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,8 +516,9 @@ def test_join_multiindex_dates(self):

tm.assert_equal(result, expected)

def test_merge_join_different_levels(self):
def test_merge_join_different_levels_raises(self):
# GH#9455
# GH 40993: For raising, enforced in 2.0

# first dataframe
df1 = DataFrame(columns=["a", "b"], data=[[1, 11], [0, 22]])
Expand All @@ -527,20 +528,16 @@ def test_merge_join_different_levels(self):
df2 = DataFrame(columns=columns, data=[[1, 33], [0, 44]])

# merge
columns = ["a", "b", ("c", "c1")]
expected = DataFrame(columns=columns, data=[[1, 11, 33], [0, 22, 44]])
with tm.assert_produces_warning(FutureWarning):
result = pd.merge(df1, df2, on="a")
tm.assert_frame_equal(result, expected)
with pytest.raises(
MergeError, match="Not allowed to merge between different levels"
):
pd.merge(df1, df2, on="a")

# join, see discussion in GH#12219
columns = ["a", "b", ("a", ""), ("c", "c1")]
expected = DataFrame(columns=columns, data=[[1, 11, 0, 44], [0, 22, 1, 33]])
msg = "merging between different levels is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
# stacklevel is chosen to be correct for pd.merge, not DataFrame.join
result = df1.join(df2, on="a")
tm.assert_frame_equal(result, expected)
with pytest.raises(
MergeError, match="Not allowed to merge between different levels"
):
df1.join(df2, on="a")

def test_frame_join_tzaware(self):
test1 = DataFrame(
Expand Down
23 changes: 12 additions & 11 deletions pandas/tests/reshape/merge/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,18 +420,18 @@ def test_join_inner_multiindex(self, lexsorted_two_level_string_multiindex):

# _assert_same_contents(expected, expected2.loc[:, expected.columns])

def test_join_hierarchical_mixed(self):
def test_join_hierarchical_mixed_raises(self):
# GH 2024
# GH 40993: For raising, enforced in 2.0
df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"])
new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]})
other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"])
other_df.set_index("a", inplace=True)
# GH 9455, 12219
msg = "merging between different levels is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = merge(new_df, other_df, left_index=True, right_index=True)
assert ("b", "mean") in result
assert "b" in result
with pytest.raises(
pd.errors.MergeError, match="Not allowed to merge between different levels"
):
merge(new_df, other_df, left_index=True, right_index=True)

def test_join_float64_float32(self):

Expand Down Expand Up @@ -642,11 +642,12 @@ def test_join_dups(self):
dta = x.merge(y, left_index=True, right_index=True).merge(
z, left_index=True, right_index=True, how="outer"
)
with tm.assert_produces_warning(FutureWarning):
dta = dta.merge(w, left_index=True, right_index=True)
expected = concat([x, y, z, w], axis=1)
expected.columns = ["x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y"]
tm.assert_frame_equal(dta, expected)
# GH 40991: As of 2.0, suffixes that cause duplicate columns raise MergeError
with pytest.raises(
pd.errors.MergeError,
match="Passing 'suffixes' which cause duplicate columns",
):
dta.merge(w, left_index=True, right_index=True)

def test_join_multi_to_multi(self, join_type):
# GH 20475
Expand Down
44 changes: 18 additions & 26 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2207,6 +2207,7 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm):

def test_merge_series_multilevel():
# GH#47946
# GH 40993: For raising, enforced in 2.0
a = DataFrame(
{"A": [1, 2, 3, 4]},
index=MultiIndex.from_product([["a", "b"], [0, 1]], names=["outer", "inner"]),
Expand All @@ -2216,13 +2217,10 @@ def test_merge_series_multilevel():
index=MultiIndex.from_product([["a", "b"], [1, 2]], names=["outer", "inner"]),
name=("B", "C"),
)
expected = DataFrame(
{"A": [2, 4], ("B", "C"): [1, 3]},
index=MultiIndex.from_product([["a", "b"], [1]], names=["outer", "inner"]),
)
with tm.assert_produces_warning(FutureWarning):
result = merge(a, b, on=["outer", "inner"])
tm.assert_frame_equal(result, expected)
with pytest.raises(
MergeError, match="Not allowed to merge between different levels"
):
merge(a, b, on=["outer", "inner"])


@pytest.mark.parametrize(
Expand Down Expand Up @@ -2303,12 +2301,12 @@ def test_merge_suffix_error(col1, col2, suffixes):


@pytest.mark.parametrize("suffixes", [{"left", "right"}, {"left": 0, "right": 0}])
def test_merge_suffix_warns(suffixes):
def test_merge_suffix_raises(suffixes):
a = DataFrame({"a": [1, 2, 3]})
b = DataFrame({"b": [3, 4, 5]})

with tm.assert_produces_warning(FutureWarning):
merge(a, b, left_index=True, right_index=True, suffixes={"left", "right"})
with pytest.raises(TypeError, match="Passing 'suffixes' as a"):
merge(a, b, left_index=True, right_index=True, suffixes=suffixes)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -2609,20 +2607,16 @@ def test_merge_result_empty_index_and_on():
tm.assert_frame_equal(result, expected)


def test_merge_suffixes_produce_dup_columns_warns():
# GH#22818
def test_merge_suffixes_produce_dup_columns_raises():
# GH#22818; Enforced in 2.0
left = DataFrame({"a": [1, 2, 3], "b": 1, "b_x": 2})
right = DataFrame({"a": [1, 2, 3], "b": 2})
expected = DataFrame(
[[1, 1, 2, 2], [2, 1, 2, 2], [3, 1, 2, 2]], columns=["a", "b_x", "b_x", "b_y"]
)
with tm.assert_produces_warning(FutureWarning):
result = merge(left, right, on="a")
tm.assert_frame_equal(result, expected)

with tm.assert_produces_warning(FutureWarning):
with pytest.raises(MergeError, match="Passing 'suffixes' which cause duplicate"):
merge(left, right, on="a")

with pytest.raises(MergeError, match="Passing 'suffixes' which cause duplicate"):
merge(right, left, on="a", suffixes=("_y", "_x"))
tm.assert_frame_equal(result, expected)


def test_merge_duplicate_columns_with_suffix_no_warning():
Expand All @@ -2635,15 +2629,13 @@ def test_merge_duplicate_columns_with_suffix_no_warning():
tm.assert_frame_equal(result, expected)


def test_merge_duplicate_columns_with_suffix_causing_another_duplicate():
# GH#22818
def test_merge_duplicate_columns_with_suffix_causing_another_duplicate_raises():
# GH#22818, Enforced in 2.0
# This should raise MergeError because the suffixes cause another collision
left = DataFrame([[1, 1, 1, 1], [2, 2, 2, 2]], columns=["a", "b", "b", "b_x"])
right = DataFrame({"a": [1, 3], "b": 2})
with tm.assert_produces_warning(FutureWarning):
result = merge(left, right, on="a")
expected = DataFrame([[1, 1, 1, 1, 2]], columns=["a", "b_x", "b_x", "b_x", "b_y"])
tm.assert_frame_equal(result, expected)
with pytest.raises(MergeError, match="Passing 'suffixes' which cause duplicate"):
merge(left, right, on="a")


def test_merge_string_float_column_result():
Expand Down