From 713b209baedc9d31f172228c50db5301986aee55 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 1 Nov 2022 15:39:37 -0700 Subject: [PATCH] DEPR: Enforce disallowed merging scenarios (#49429) * Enforce merge suffixes tuples * Duplicate merge columns * enforce disallowing different levels * Fix test --- doc/source/whatsnew/v2.0.0.rst | 3 ++ pandas/core/reshape/merge.py | 29 ++++++---------- pandas/tests/frame/methods/test_join.py | 23 ++++++------- pandas/tests/reshape/merge/test_join.py | 23 +++++++------ pandas/tests/reshape/merge/test_merge.py | 44 ++++++++++-------------- 5 files changed, 54 insertions(+), 68 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 04e5154ca1a0b..d2bcf8d19651e 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -273,6 +273,9 @@ Removal of prior version deprecations/changes - Enforced disallowing a string column label into ``times`` in :meth:`DataFrame.ewm` (:issue:`43265`) - Enforced disallowing a tuple of column labels into :meth:`.DataFrameGroupBy.__getitem__` (:issue:`30546`) - Enforced disallowing setting values with ``.loc`` using a positional slice. Use ``.loc`` with labels or ``.iloc`` with positions instead (:issue:`31840`) +- Enforced disallowing ``dict`` or ``set`` objects in ``suffixes`` in :func:`merge` (:issue:`34810`) +- Enforced disallowing :func:`merge` to produce duplicated columns through the ``suffixes`` keyword and already existing columns (:issue:`22818`) +- Enforced disallowing using :func:`merge` or :func:`join` on a different number of levels (:issue:`34862`) - Removed setting Categorical._codes directly (:issue:`41429`) - Removed setting Categorical.categories directly (:issue:`47834`) - Removed argument ``inplace`` from :meth:`Categorical.add_categories`, :meth:`Categorical.remove_categories`, :meth:`Categorical.set_categories`, :meth:`Categorical.rename_categories`, :meth:`Categorical.reorder_categories`, :meth:`Categorical.set_ordered`, :meth:`Categorical.as_ordered`, :meth:`Categorical.as_unordered` (:issue:`37981`, :issue:`41118`, :issue:`41133`, :issue:`47834`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f198db72460fd..f4332f2c7eb1b 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -676,16 +676,14 @@ def __init__( f"right_index parameter must be of type bool, not {type(right_index)}" ) - # warn user when merging between different levels + # GH 40993: raise when merging between different levels; enforced in 2.0 if _left.columns.nlevels != _right.columns.nlevels: msg = ( - "merging between different levels is deprecated and will be removed " - f"in a future version. ({_left.columns.nlevels} levels on the left, " + "Not allowed to merge between different levels. " + f"({_left.columns.nlevels} levels on the left, " f"{_right.columns.nlevels} on the right)" ) - # stacklevel chosen to be correct when this is reached via pd.merge - # (and not DataFrame.join) - warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) + raise MergeError(msg) self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on) @@ -2475,13 +2473,10 @@ def _items_overlap_with_suffix( If corresponding suffix is empty, the entry is simply converted to string. """ - if not is_list_like(suffixes, allow_sets=False): - warnings.warn( - f"Passing 'suffixes' as a {type(suffixes)}, is not supported and may give " - "unexpected results. Provide 'suffixes' as a tuple instead. In the " - "future a 'TypeError' will be raised.", - FutureWarning, - stacklevel=find_stack_level(), + if not is_list_like(suffixes, allow_sets=False) or isinstance(suffixes, dict): + raise TypeError( + f"Passing 'suffixes' as a {type(suffixes)}, is not supported. " + "Provide 'suffixes' as a tuple instead." ) to_rename = left.intersection(right) @@ -2527,11 +2522,9 @@ def renamer(x, suffix): if not rlabels.is_unique: dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist()) if dups: - warnings.warn( - f"Passing 'suffixes' which cause duplicate columns {set(dups)} in the " - f"result is deprecated and will raise a MergeError in a future version.", - FutureWarning, - stacklevel=find_stack_level(), + raise MergeError( + f"Passing 'suffixes' which cause duplicate columns {set(dups)} is " + f"not allowed.", ) return llabels, rlabels diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 7db26f7eb570b..9081f69d5d2bc 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -516,8 +516,9 @@ def test_join_multiindex_dates(self): tm.assert_equal(result, expected) - def test_merge_join_different_levels(self): + def test_merge_join_different_levels_raises(self): # GH#9455 + # GH 40993: For raising, enforced in 2.0 # first dataframe df1 = DataFrame(columns=["a", "b"], data=[[1, 11], [0, 22]]) @@ -527,20 +528,16 @@ def test_merge_join_different_levels(self): df2 = DataFrame(columns=columns, data=[[1, 33], [0, 44]]) # merge - columns = ["a", "b", ("c", "c1")] - expected = DataFrame(columns=columns, data=[[1, 11, 33], [0, 22, 44]]) - with tm.assert_produces_warning(FutureWarning): - result = pd.merge(df1, df2, on="a") - tm.assert_frame_equal(result, expected) + with pytest.raises( + MergeError, match="Not allowed to merge between different levels" + ): + pd.merge(df1, df2, on="a") # join, see discussion in GH#12219 - columns = ["a", "b", ("a", ""), ("c", "c1")] - expected = DataFrame(columns=columns, data=[[1, 11, 0, 44], [0, 22, 1, 33]]) - msg = "merging between different levels is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - # stacklevel is chosen to be correct for pd.merge, not DataFrame.join - result = df1.join(df2, on="a") - tm.assert_frame_equal(result, expected) + with pytest.raises( + MergeError, match="Not allowed to merge between different levels" + ): + df1.join(df2, on="a") def test_frame_join_tzaware(self): test1 = DataFrame( diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 23d7c91ceefae..dd2c59ec161e7 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -420,18 +420,18 @@ def test_join_inner_multiindex(self, lexsorted_two_level_string_multiindex): # _assert_same_contents(expected, expected2.loc[:, expected.columns]) - def test_join_hierarchical_mixed(self): + def test_join_hierarchical_mixed_raises(self): # GH 2024 + # GH 40993: For raising, enforced in 2.0 df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"]) new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]}) other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"]) other_df.set_index("a", inplace=True) # GH 9455, 12219 - msg = "merging between different levels is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = merge(new_df, other_df, left_index=True, right_index=True) - assert ("b", "mean") in result - assert "b" in result + with pytest.raises( + pd.errors.MergeError, match="Not allowed to merge between different levels" + ): + merge(new_df, other_df, left_index=True, right_index=True) def test_join_float64_float32(self): @@ -642,11 +642,12 @@ def test_join_dups(self): dta = x.merge(y, left_index=True, right_index=True).merge( z, left_index=True, right_index=True, how="outer" ) - with tm.assert_produces_warning(FutureWarning): - dta = dta.merge(w, left_index=True, right_index=True) - expected = concat([x, y, z, w], axis=1) - expected.columns = ["x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y"] - tm.assert_frame_equal(dta, expected) + # GH 40991: As of 2.0 causes duplicate columns + with pytest.raises( + pd.errors.MergeError, + match="Passing 'suffixes' which cause duplicate columns", + ): + dta.merge(w, left_index=True, right_index=True) def test_join_multi_to_multi(self, join_type): # GH 20475 diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index edfae3ad9dac6..780e3003d50d7 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2207,6 +2207,7 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm): def test_merge_series_multilevel(): # GH#47946 + # GH 40993: For raising, enforced in 2.0 a = DataFrame( {"A": [1, 2, 3, 4]}, index=MultiIndex.from_product([["a", "b"], [0, 1]], names=["outer", "inner"]), @@ -2216,13 +2217,10 @@ def test_merge_series_multilevel(): index=MultiIndex.from_product([["a", "b"], [1, 2]], names=["outer", "inner"]), name=("B", "C"), ) - expected = DataFrame( - {"A": [2, 4], ("B", "C"): [1, 3]}, - index=MultiIndex.from_product([["a", "b"], [1]], names=["outer", "inner"]), - ) - with tm.assert_produces_warning(FutureWarning): - result = merge(a, b, on=["outer", "inner"]) - tm.assert_frame_equal(result, expected) + with pytest.raises( + MergeError, match="Not allowed to merge between different levels" + ): + merge(a, b, on=["outer", "inner"]) @pytest.mark.parametrize( @@ -2303,12 +2301,12 @@ def test_merge_suffix_error(col1, col2, suffixes): @pytest.mark.parametrize("suffixes", [{"left", "right"}, {"left": 0, "right": 0}]) -def test_merge_suffix_warns(suffixes): +def test_merge_suffix_raises(suffixes): a = DataFrame({"a": [1, 2, 3]}) b = DataFrame({"b": [3, 4, 5]}) - with tm.assert_produces_warning(FutureWarning): - merge(a, b, left_index=True, right_index=True, suffixes={"left", "right"}) + with pytest.raises(TypeError, match="Passing 'suffixes' as a"): + merge(a, b, left_index=True, right_index=True, suffixes=suffixes) @pytest.mark.parametrize( @@ -2609,20 +2607,16 @@ def test_merge_result_empty_index_and_on(): tm.assert_frame_equal(result, expected) -def test_merge_suffixes_produce_dup_columns_warns(): - # GH#22818 +def test_merge_suffixes_produce_dup_columns_raises(): + # GH#22818; Enforced in 2.0 left = DataFrame({"a": [1, 2, 3], "b": 1, "b_x": 2}) right = DataFrame({"a": [1, 2, 3], "b": 2}) - expected = DataFrame( - [[1, 1, 2, 2], [2, 1, 2, 2], [3, 1, 2, 2]], columns=["a", "b_x", "b_x", "b_y"] - ) - with tm.assert_produces_warning(FutureWarning): - result = merge(left, right, on="a") - tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(FutureWarning): + with pytest.raises(MergeError, match="Passing 'suffixes' which cause duplicate"): + merge(left, right, on="a") + + with pytest.raises(MergeError, match="Passing 'suffixes' which cause duplicate"): merge(right, left, on="a", suffixes=("_y", "_x")) - tm.assert_frame_equal(result, expected) def test_merge_duplicate_columns_with_suffix_no_warning(): @@ -2635,15 +2629,13 @@ def test_merge_duplicate_columns_with_suffix_no_warning(): tm.assert_frame_equal(result, expected) -def test_merge_duplicate_columns_with_suffix_causing_another_duplicate(): - # GH#22818 +def test_merge_duplicate_columns_with_suffix_causing_another_duplicate_raises(): + # GH#22818, Enforced in 2.0 # This should raise warning because suffixes cause another collision left = DataFrame([[1, 1, 1, 1], [2, 2, 2, 2]], columns=["a", "b", "b", "b_x"]) right = DataFrame({"a": [1, 3], "b": 2}) - with tm.assert_produces_warning(FutureWarning): - result = merge(left, right, on="a") - expected = DataFrame([[1, 1, 1, 1, 2]], columns=["a", "b_x", "b_x", "b_x", "b_y"]) - tm.assert_frame_equal(result, expected) + with pytest.raises(MergeError, match="Passing 'suffixes' which cause duplicate"): + merge(left, right, on="a") def test_merge_string_float_column_result():