Skip to content

Commit

Permalink
Deprecate suffixes in merge producing duplicate columns (pandas-dev#4…
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl authored and yeshsurya committed Apr 21, 2021
1 parent 6dd8ac6 commit 41f6fcb
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 2 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,7 @@ Deprecations
- Deprecated support for ``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}`` instead (:issue:`40363`)
- Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`)
- Deprecated the ``level`` keyword for :class:`DataFrame` and :class:`Series` aggregations; use groupby instead (:issue:`39983`)
- Deprecated :func:`merge` producing duplicate column names when a suffix added through the ``suffixes`` keyword collides with an already existing column; this will raise a ``MergeError`` in a future version (:issue:`22818`)

.. ---------------------------------------------------------------------------
Expand Down
20 changes: 19 additions & 1 deletion pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2311,4 +2311,22 @@ def renamer(x, suffix):
lrenamer = partial(renamer, suffix=lsuffix)
rrenamer = partial(renamer, suffix=rsuffix)

return (left._transform_index(lrenamer), right._transform_index(rrenamer))
llabels = left._transform_index(lrenamer)
rlabels = right._transform_index(rrenamer)

dups = []
if not llabels.is_unique:
# Only warn when duplicates are caused because of suffixes, already duplicated
# columns in origin should not warn
dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist()
if not rlabels.is_unique:
dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist())
if dups:
warnings.warn(
f"Passing 'suffixes' which cause duplicate columns {set(dups)} in the "
f"result is deprecated and will raise a MergeError in a future version.",
FutureWarning,
stacklevel=4,
)

return llabels, rlabels
3 changes: 2 additions & 1 deletion pandas/tests/reshape/merge/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,7 +629,8 @@ def test_join_dups(self):
dta = x.merge(y, left_index=True, right_index=True).merge(
z, left_index=True, right_index=True, how="outer"
)
dta = dta.merge(w, left_index=True, right_index=True)
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
dta = dta.merge(w, left_index=True, right_index=True)
expected = concat([x, y, z, w], axis=1)
expected.columns = ["x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y"]
tm.assert_frame_equal(dta, expected)
Expand Down
37 changes: 37 additions & 0 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2409,3 +2409,40 @@ def test_merge_result_empty_index_and_on():

result = merge(df2, df1, left_index=True, right_on=["b"])
tm.assert_frame_equal(result, expected)


def test_merge_suffixes_produce_dup_columns_warns():
# GH#22818
left = DataFrame({"a": [1, 2, 3], "b": 1, "b_x": 2})
right = DataFrame({"a": [1, 2, 3], "b": 2})
expected = DataFrame(
[[1, 1, 2, 2], [2, 1, 2, 2], [3, 1, 2, 2]], columns=["a", "b_x", "b_x", "b_y"]
)
with tm.assert_produces_warning(FutureWarning):
result = merge(left, right, on="a")
tm.assert_frame_equal(result, expected)

with tm.assert_produces_warning(FutureWarning):
merge(right, left, on="a", suffixes=("_y", "_x"))
tm.assert_frame_equal(result, expected)


def test_merge_duplicate_columns_with_suffix_no_warning():
    # GH#22818
    # Do not raise warning when duplicates are caused by duplicates in origin
    left = DataFrame([[1, 1, 1], [2, 2, 2]], columns=["a", "b", "b"])
    right = DataFrame({"a": [1, 3], "b": 2})
    # Passing None makes the context manager fail on any emitted warning, so the
    # "no warning" promise in the test name is actually checked.
    with tm.assert_produces_warning(None):
        result = merge(left, right, on="a")
    expected = DataFrame([[1, 1, 1, 2]], columns=["a", "b_x", "b_x", "b_y"])
    tm.assert_frame_equal(result, expected)


def test_merge_duplicate_columns_with_suffix_causing_another_duplicate():
# GH#22818
# This should raise warning because suffixes cause another collision
left = DataFrame([[1, 1, 1, 1], [2, 2, 2, 2]], columns=["a", "b", "b", "b_x"])
right = DataFrame({"a": [1, 3], "b": 2})
with tm.assert_produces_warning(FutureWarning):
result = merge(left, right, on="a")
expected = DataFrame([[1, 1, 1, 1, 2]], columns=["a", "b_x", "b_x", "b_x", "b_y"])
tm.assert_frame_equal(result, expected)

0 comments on commit 41f6fcb

Please sign in to comment.