From 41f6fcbdc1d749e5879a0ab99ee59ee184bbd35e Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Wed, 21 Apr 2021 00:47:29 +0200
Subject: [PATCH] Deprecate suffixes in merge producing duplicate columns (#40991)

---
 doc/source/whatsnew/v1.3.0.rst           |  1 +
 pandas/core/reshape/merge.py             | 20 ++++++++++++-
 pandas/tests/reshape/merge/test_join.py  |  3 +-
 pandas/tests/reshape/merge/test_merge.py | 37 ++++++++++++++++++++++++
 4 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 31a34c0532811..a3638466d1265 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -603,6 +603,7 @@ Deprecations
 - Deprecated support for ``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}`` instead (:issue:`40363`)
 - Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`)
 - Deprecated the ``level`` keyword for :class:`DataFrame` and :class:`Series` aggregations; use groupby instead (:issue:`39983`)
+- Deprecated :func:`merge` producing duplicated columns when a label renamed through the ``suffixes`` keyword collides with an already existing column (:issue:`22818`)
 
 .. 
---------------------------------------------------------------------------
 
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 8cee0dd2abb88..5a13506a42011 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -2311,4 +2311,22 @@ def renamer(x, suffix):
     lrenamer = partial(renamer, suffix=lsuffix)
     rrenamer = partial(renamer, suffix=rsuffix)
 
-    return (left._transform_index(lrenamer), right._transform_index(rrenamer))
+    llabels = left._transform_index(lrenamer)
+    rlabels = right._transform_index(rrenamer)
+
+    dups = []
+    if not llabels.is_unique:
+        # Only warn about duplicates introduced by the suffixes; labels that were
+        # already duplicated in the original columns must not trigger a warning
+        dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist()
+    if not rlabels.is_unique:
+        dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist())
+    if dups:
+        warnings.warn(
+            f"Passing 'suffixes' which cause duplicate columns {set(dups)} in the "
+            "result is deprecated and will raise a MergeError in a future version.",
+            FutureWarning,
+            stacklevel=4,
+        )
+
+    return llabels, rlabels
diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py
index fb161e38c7155..166aa3f5e3263 100644
--- a/pandas/tests/reshape/merge/test_join.py
+++ b/pandas/tests/reshape/merge/test_join.py
@@ -629,7 +629,8 @@ def test_join_dups(self):
         dta = x.merge(y, left_index=True, right_index=True).merge(
             z, left_index=True, right_index=True, how="outer"
         )
-        dta = dta.merge(w, left_index=True, right_index=True)
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            dta = dta.merge(w, left_index=True, right_index=True)
         expected = concat([x, y, z, w], axis=1)
         expected.columns = ["x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y"]
         tm.assert_frame_equal(dta, expected)
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index 
9699a0dec4891..1495a34274a94 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -2409,3 +2409,40 @@ def test_merge_result_empty_index_and_on():
 
     result = merge(df2, df1, left_index=True, right_on=["b"])
     tm.assert_frame_equal(result, expected)
+
+
+def test_merge_suffixes_produce_dup_columns_warns():
+    # GH#22818
+    left = DataFrame({"a": [1, 2, 3], "b": 1, "b_x": 2})
+    right = DataFrame({"a": [1, 2, 3], "b": 2})
+    expected = DataFrame(
+        [[1, 1, 2, 2], [2, 1, 2, 2], [3, 1, 2, 2]], columns=["a", "b_x", "b_x", "b_y"]
+    )
+    with tm.assert_produces_warning(FutureWarning):
+        result = merge(left, right, on="a")
+    tm.assert_frame_equal(result, expected)
+    # Swapping the frames and the suffixes moves "b_y" ahead of both "b_x" columns
+    with tm.assert_produces_warning(FutureWarning):
+        result = merge(right, left, on="a", suffixes=("_y", "_x"))
+    tm.assert_frame_equal(result, expected.iloc[:, [0, 3, 1, 2]])
+
+
+def test_merge_duplicate_columns_with_suffix_no_warning():
+    # GH#22818: no warning when the duplicates already exist in the original columns
+    left = DataFrame([[1, 1, 1], [2, 2, 2]], columns=["a", "b", "b"])
+    right = DataFrame({"a": [1, 3], "b": 2})
+    with tm.assert_produces_warning(None):
+        result = merge(left, right, on="a")
+    expected = DataFrame([[1, 1, 1, 2]], columns=["a", "b_x", "b_x", "b_y"])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_merge_duplicate_columns_with_suffix_causing_another_duplicate():
+    # GH#22818
+    # This should raise warning because suffixes cause another collision
+    left = DataFrame([[1, 1, 1, 1], [2, 2, 2, 2]], columns=["a", "b", "b", "b_x"])
+    right = DataFrame({"a": [1, 3], "b": 2})
+    with tm.assert_produces_warning(FutureWarning):
+        result = merge(left, right, on="a")
+    expected = DataFrame([[1, 1, 1, 1, 2]], columns=["a", "b_x", "b_x", "b_x", "b_y"])
+    tm.assert_frame_equal(result, expected)