Skip to content

Commit

Permalink
DEPR: Enforce disallowed merging scenarios (pandas-dev#49429)
Browse files Browse the repository at this point in the history
* Enforce merge suffixes tuples

* Duplicate merge columns

* enforce disallowing different levels

* Fix test
  • Loading branch information
mroeschke authored and noatamir committed Nov 9, 2022
1 parent 875bc5f commit 713b209
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 68 deletions.
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,9 @@ Removal of prior version deprecations/changes
- Enforced disallowing a string column label into ``times`` in :meth:`DataFrame.ewm` (:issue:`43265`)
- Enforced disallowing a tuple of column labels into :meth:`.DataFrameGroupBy.__getitem__` (:issue:`30546`)
- Enforced disallowing setting values with ``.loc`` using a positional slice. Use ``.loc`` with labels or ``.iloc`` with positions instead (:issue:`31840`)
- Enforced disallowing passing ``dict`` or ``set`` objects as ``suffixes`` in :func:`merge` (:issue:`34810`)
- Enforced disallowing :func:`merge` from producing duplicated columns when the ``suffixes`` keyword creates labels that collide with already existing columns (:issue:`22818`)
- Enforced disallowing using :func:`merge` or :func:`join` on objects whose columns have a different number of levels (:issue:`34862`)
- Removed setting Categorical._codes directly (:issue:`41429`)
- Removed setting Categorical.categories directly (:issue:`47834`)
- Removed argument ``inplace`` from :meth:`Categorical.add_categories`, :meth:`Categorical.remove_categories`, :meth:`Categorical.set_categories`, :meth:`Categorical.rename_categories`, :meth:`Categorical.reorder_categories`, :meth:`Categorical.set_ordered`, :meth:`Categorical.as_ordered`, :meth:`Categorical.as_unordered` (:issue:`37981`, :issue:`41118`, :issue:`41133`, :issue:`47834`)
Expand Down
29 changes: 11 additions & 18 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -676,16 +676,14 @@ def __init__(
f"right_index parameter must be of type bool, not {type(right_index)}"
)

# warn user when merging between different levels
# GH 40993: raise when merging between different levels; enforced in 2.0
if _left.columns.nlevels != _right.columns.nlevels:
msg = (
"merging between different levels is deprecated and will be removed "
f"in a future version. ({_left.columns.nlevels} levels on the left, "
"Not allowed to merge between different levels. "
f"({_left.columns.nlevels} levels on the left, "
f"{_right.columns.nlevels} on the right)"
)
# stacklevel chosen to be correct when this is reached via pd.merge
# (and not DataFrame.join)
warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
raise MergeError(msg)

self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)

Expand Down Expand Up @@ -2475,13 +2473,10 @@ def _items_overlap_with_suffix(
If corresponding suffix is empty, the entry is simply converted to string.
"""
if not is_list_like(suffixes, allow_sets=False):
warnings.warn(
f"Passing 'suffixes' as a {type(suffixes)}, is not supported and may give "
"unexpected results. Provide 'suffixes' as a tuple instead. In the "
"future a 'TypeError' will be raised.",
FutureWarning,
stacklevel=find_stack_level(),
if not is_list_like(suffixes, allow_sets=False) or isinstance(suffixes, dict):
raise TypeError(
f"Passing 'suffixes' as a {type(suffixes)}, is not supported. "
"Provide 'suffixes' as a tuple instead."
)

to_rename = left.intersection(right)
Expand Down Expand Up @@ -2527,11 +2522,9 @@ def renamer(x, suffix):
if not rlabels.is_unique:
dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist())
if dups:
warnings.warn(
f"Passing 'suffixes' which cause duplicate columns {set(dups)} in the "
f"result is deprecated and will raise a MergeError in a future version.",
FutureWarning,
stacklevel=find_stack_level(),
raise MergeError(
f"Passing 'suffixes' which cause duplicate columns {set(dups)} is "
f"not allowed.",
)

return llabels, rlabels
23 changes: 10 additions & 13 deletions pandas/tests/frame/methods/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,8 +516,9 @@ def test_join_multiindex_dates(self):

tm.assert_equal(result, expected)

def test_merge_join_different_levels(self):
def test_merge_join_different_levels_raises(self):
# GH#9455
# GH 40993: For raising, enforced in 2.0

# first dataframe
df1 = DataFrame(columns=["a", "b"], data=[[1, 11], [0, 22]])
Expand All @@ -527,20 +528,16 @@ def test_merge_join_different_levels(self):
df2 = DataFrame(columns=columns, data=[[1, 33], [0, 44]])

# merge
columns = ["a", "b", ("c", "c1")]
expected = DataFrame(columns=columns, data=[[1, 11, 33], [0, 22, 44]])
with tm.assert_produces_warning(FutureWarning):
result = pd.merge(df1, df2, on="a")
tm.assert_frame_equal(result, expected)
with pytest.raises(
MergeError, match="Not allowed to merge between different levels"
):
pd.merge(df1, df2, on="a")

# join, see discussion in GH#12219
columns = ["a", "b", ("a", ""), ("c", "c1")]
expected = DataFrame(columns=columns, data=[[1, 11, 0, 44], [0, 22, 1, 33]])
msg = "merging between different levels is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
# stacklevel is chosen to be correct for pd.merge, not DataFrame.join
result = df1.join(df2, on="a")
tm.assert_frame_equal(result, expected)
with pytest.raises(
MergeError, match="Not allowed to merge between different levels"
):
df1.join(df2, on="a")

def test_frame_join_tzaware(self):
test1 = DataFrame(
Expand Down
23 changes: 12 additions & 11 deletions pandas/tests/reshape/merge/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,18 +420,18 @@ def test_join_inner_multiindex(self, lexsorted_two_level_string_multiindex):

# _assert_same_contents(expected, expected2.loc[:, expected.columns])

def test_join_hierarchical_mixed(self):
def test_join_hierarchical_mixed_raises(self):
# GH 2024
# GH 40993: For raising, enforced in 2.0
df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"])
new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]})
other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"])
other_df.set_index("a", inplace=True)
# GH 9455, 12219
msg = "merging between different levels is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = merge(new_df, other_df, left_index=True, right_index=True)
assert ("b", "mean") in result
assert "b" in result
with pytest.raises(
pd.errors.MergeError, match="Not allowed to merge between different levels"
):
merge(new_df, other_df, left_index=True, right_index=True)

def test_join_float64_float32(self):

Expand Down Expand Up @@ -642,11 +642,12 @@ def test_join_dups(self):
dta = x.merge(y, left_index=True, right_index=True).merge(
z, left_index=True, right_index=True, how="outer"
)
with tm.assert_produces_warning(FutureWarning):
dta = dta.merge(w, left_index=True, right_index=True)
expected = concat([x, y, z, w], axis=1)
expected.columns = ["x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y"]
tm.assert_frame_equal(dta, expected)
# GH 40991: As of 2.0, suffixes that cause duplicate columns raise a MergeError
with pytest.raises(
pd.errors.MergeError,
match="Passing 'suffixes' which cause duplicate columns",
):
dta.merge(w, left_index=True, right_index=True)

def test_join_multi_to_multi(self, join_type):
# GH 20475
Expand Down
44 changes: 18 additions & 26 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2207,6 +2207,7 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm):

def test_merge_series_multilevel():
# GH#47946
# GH 40993: For raising, enforced in 2.0
a = DataFrame(
{"A": [1, 2, 3, 4]},
index=MultiIndex.from_product([["a", "b"], [0, 1]], names=["outer", "inner"]),
Expand All @@ -2216,13 +2217,10 @@ def test_merge_series_multilevel():
index=MultiIndex.from_product([["a", "b"], [1, 2]], names=["outer", "inner"]),
name=("B", "C"),
)
expected = DataFrame(
{"A": [2, 4], ("B", "C"): [1, 3]},
index=MultiIndex.from_product([["a", "b"], [1]], names=["outer", "inner"]),
)
with tm.assert_produces_warning(FutureWarning):
result = merge(a, b, on=["outer", "inner"])
tm.assert_frame_equal(result, expected)
with pytest.raises(
MergeError, match="Not allowed to merge between different levels"
):
merge(a, b, on=["outer", "inner"])


@pytest.mark.parametrize(
Expand Down Expand Up @@ -2303,12 +2301,12 @@ def test_merge_suffix_error(col1, col2, suffixes):


@pytest.mark.parametrize("suffixes", [{"left", "right"}, {"left": 0, "right": 0}])
def test_merge_suffix_warns(suffixes):
def test_merge_suffix_raises(suffixes):
a = DataFrame({"a": [1, 2, 3]})
b = DataFrame({"b": [3, 4, 5]})

with tm.assert_produces_warning(FutureWarning):
merge(a, b, left_index=True, right_index=True, suffixes={"left", "right"})
with pytest.raises(TypeError, match="Passing 'suffixes' as a"):
merge(a, b, left_index=True, right_index=True, suffixes=suffixes)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -2609,20 +2607,16 @@ def test_merge_result_empty_index_and_on():
tm.assert_frame_equal(result, expected)


def test_merge_suffixes_produce_dup_columns_warns():
# GH#22818
def test_merge_suffixes_produce_dup_columns_raises():
# GH#22818; Enforced in 2.0
left = DataFrame({"a": [1, 2, 3], "b": 1, "b_x": 2})
right = DataFrame({"a": [1, 2, 3], "b": 2})
expected = DataFrame(
[[1, 1, 2, 2], [2, 1, 2, 2], [3, 1, 2, 2]], columns=["a", "b_x", "b_x", "b_y"]
)
with tm.assert_produces_warning(FutureWarning):
result = merge(left, right, on="a")
tm.assert_frame_equal(result, expected)

with tm.assert_produces_warning(FutureWarning):
with pytest.raises(MergeError, match="Passing 'suffixes' which cause duplicate"):
merge(left, right, on="a")

with pytest.raises(MergeError, match="Passing 'suffixes' which cause duplicate"):
merge(right, left, on="a", suffixes=("_y", "_x"))
tm.assert_frame_equal(result, expected)


def test_merge_duplicate_columns_with_suffix_no_warning():
Expand All @@ -2635,15 +2629,13 @@ def test_merge_duplicate_columns_with_suffix_no_warning():
tm.assert_frame_equal(result, expected)


def test_merge_duplicate_columns_with_suffix_causing_another_duplicate():
# GH#22818
def test_merge_duplicate_columns_with_suffix_causing_another_duplicate_raises():
# GH#22818, Enforced in 2.0
# This should raise a MergeError because suffixes cause another collision
left = DataFrame([[1, 1, 1, 1], [2, 2, 2, 2]], columns=["a", "b", "b", "b_x"])
right = DataFrame({"a": [1, 3], "b": 2})
with tm.assert_produces_warning(FutureWarning):
result = merge(left, right, on="a")
expected = DataFrame([[1, 1, 1, 1, 2]], columns=["a", "b_x", "b_x", "b_x", "b_y"])
tm.assert_frame_equal(result, expected)
with pytest.raises(MergeError, match="Passing 'suffixes' which cause duplicate"):
merge(left, right, on="a")


def test_merge_string_float_column_result():
Expand Down

0 comments on commit 713b209

Please sign in to comment.