From 87835fa83c585e561250ac3ff49cb87b9515527b Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 31 Oct 2020 22:33:39 +0100 Subject: [PATCH 1/5] Improve error reporting in merge --- pandas/core/reshape/merge.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5012be593820e..8b5f1cda95401 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1200,11 +1200,9 @@ def _validate_specification(self): if self.left_index and self.right_index: self.left_on, self.right_on = (), () elif self.left_index: - if self.right_on is None: - raise MergeError("Must pass right_on or right_index=True") + raise MergeError("Must pass right_on or right_index=True") elif self.right_index: - if self.left_on is None: - raise MergeError("Must pass left_on or left_index=True") + raise MergeError("Must pass left_on or left_index=True") else: # use the common columns common_cols = self.left.columns.intersection(self.right.columns) @@ -1225,8 +1223,19 @@ def _validate_specification(self): 'Can only pass argument "on" OR "left_on" ' 'and "right_on", not a combination of both.' ) + if self.left_index or self.right_index: + raise MergeError( + 'Can only pass argument "on" OR "left_index" ' + 'and "right_index", not a combination of both.' + ) self.left_on = self.right_on = self.on elif self.left_on is not None: + if self.left_index: + raise MergeError( + 'Can only pass argument "left_on" OR "left_index" not both.' + ) + if not self.right_index or self.right_on is None: + raise MergeError('Must pass "right_on" or "right_index".') n = len(self.left_on) if self.right_index: if len(self.left_on) != self.right.index.nlevels: @@ -1236,6 +1245,12 @@ def _validate_specification(self): ) self.right_on = [None] * n elif self.right_on is not None: + if self.right_index: + raise MergeError( + 'Can only pass argument "right_on" OR "right_index" not both.' + ) + if not self.left_index or self.left_on is None: + raise MergeError('Must pass "left_on" or "left_index".') n = len(self.right_on) if self.left_index: if len(self.right_on) != self.left.index.nlevels: From 490062b42a7806d3f5a882bd3ba5176c552af7c9 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 31 Oct 2020 22:54:59 +0100 Subject: [PATCH 2/5] Add tests --- pandas/core/reshape/merge.py | 4 +- pandas/tests/reshape/merge/test_merge.py | 54 ++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8b5f1cda95401..658b288963ced 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1235,7 +1235,7 @@ def _validate_specification(self): 'Can only pass argument "left_on" OR "left_index" not both.' ) if not self.right_index or self.right_on is None: - raise MergeError('Must pass "right_on" or "right_index".') + raise MergeError('Must pass "right_on" OR "right_index".') n = len(self.left_on) if self.right_index: if len(self.left_on) != self.right.index.nlevels: @@ -1250,7 +1250,7 @@ def _validate_specification(self): 'Can only pass argument "right_on" OR "right_index" not both.' ) if not self.left_index or self.left_on is None: - raise MergeError('Must pass "left_on" or "left_index".') + raise MergeError('Must pass "left_on" OR "left_index".') n = len(self.right_on) if self.left_index: if len(self.right_on) != self.left.index.nlevels: diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index bb2860b88b288..b7afb43cef27e 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2283,3 +2283,57 @@ def test_merge_join_categorical_multiindex(): expected = expected.drop(["Cat", "Int"], axis=1) result = a.join(b, on=["Cat1", "Int1"]) tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("func", ["merge", "merge_asof"]) +@pytest.mark.parametrize( + ("kwargs", "err_msg"), + [ + ({"left_on": "a", "left_index": True}, ["left_on", "left_index"]), + ({"right_on": "a", "right_index": True}, ["right_on", "right_index"]), + ], +) +def test_merge_join_cols_error_reporting_duplicates(func, kwargs, err_msg): + # GH: 16228 + left = DataFrame({"a": [1, 2], "b": [3, 4]}) + right = DataFrame({"a": [1, 1], "c": [5, 6]}) + msg = rf"Can only pass argument \"{err_msg[0]}\" OR \"{err_msg[1]}\" not both\." + with pytest.raises(MergeError, match=msg): + getattr(pd, func)(left, right, **kwargs) + + +@pytest.mark.parametrize("func", ["merge", "merge_asof"]) +@pytest.mark.parametrize( + ("kwargs", "err_msg"), + [ + ({"left_on": "a"}, ["right_on", "right_index"]), + ({"right_on": "a"}, ["left_on", "left_index"]), + ], +) +def test_merge_join_cols_error_reporting_missing(func, kwargs, err_msg): + # GH: 16228 + left = DataFrame({"a": [1, 2], "b": [3, 4]}) + right = DataFrame({"a": [1, 1], "c": [5, 6]}) + msg = rf"Must pass \"{err_msg[0]}\" OR \"{err_msg[1]}\"\." + with pytest.raises(MergeError, match=msg): + getattr(pd, func)(left, right, **kwargs) + + +@pytest.mark.parametrize("func", ["merge", "merge_asof"]) +@pytest.mark.parametrize( + "kwargs", + [ + {"right_index": True}, + {"left_index": True}, + ], +) +def test_merge_join_cols_error_reporting_on_and_index(func, kwargs): + # GH: 16228 + left = DataFrame({"a": [1, 2], "b": [3, 4]}) + right = DataFrame({"a": [1, 1], "c": [5, 6]}) + msg = ( + r"Can only pass argument \"on\" OR \"left_index\" " + r"and \"right_index\", not a combination of both\." + ) + with pytest.raises(MergeError, match=msg): + getattr(pd, func)(left, right, on="a", **kwargs) From 8f787d32b427c6f4052721350d81793f46a4cdf3 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 31 Oct 2020 22:56:34 +0100 Subject: [PATCH 3/5] Add whatsnew --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 7c850ffedfcab..174412b95d6ed 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -228,6 +228,7 @@ Other enhancements - :class:`Rolling` now supports the ``closed`` argument for fixed windows (:issue:`34315`) - :class:`DatetimeIndex` and :class:`Series` with ``datetime64`` or ``datetime64tz`` dtypes now support ``std`` (:issue:`37436`) - :class:`Window` now supports all Scipy window types in ``win_type`` with flexible keyword argument support (:issue:`34556`) +- Improve error reporting for :meth:`DataFrame.merge()` when invalid merge column definitions were given (:issue:`16228`) .. _whatsnew_120.api_breaking.python: From e865ae749e268507b0d499635a210f212fd2e98c Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 31 Oct 2020 23:02:47 +0100 Subject: [PATCH 4/5] Fix raising condition --- pandas/core/reshape/merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 658b288963ced..9d3773df27964 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1234,7 +1234,7 @@ def _validate_specification(self): raise MergeError( 'Can only pass argument "left_on" OR "left_index" not both.' ) - if not self.right_index or self.right_on is None: + if not self.right_index and self.right_on is None: raise MergeError('Must pass "right_on" OR "right_index".') n = len(self.left_on) if self.right_index: @@ -1249,7 +1249,7 @@ def _validate_specification(self): raise MergeError( 'Can only pass argument "right_on" OR "right_index" not both.' ) - if not self.left_index or self.left_on is None: + if not self.left_index and self.left_on is None: raise MergeError('Must pass "left_on" OR "left_index".') n = len(self.right_on) if self.left_index: From 3981fb6de36a2c0e591d0d2ba5f252a028089385 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 6 Nov 2020 21:55:28 +0100 Subject: [PATCH 5/5] Adress review comments --- pandas/tests/reshape/merge/test_merge.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index b7afb43cef27e..3d35a35b97354 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2297,7 +2297,7 @@ def test_merge_join_cols_error_reporting_duplicates(func, kwargs, err_msg): # GH: 16228 left = DataFrame({"a": [1, 2], "b": [3, 4]}) right = DataFrame({"a": [1, 1], "c": [5, 6]}) - msg = rf"Can only pass argument \"{err_msg[0]}\" OR \"{err_msg[1]}\" not both\." + msg = rf'Can only pass argument "{err_msg[0]}" OR "{err_msg[1]}" not both\.' with pytest.raises(MergeError, match=msg): getattr(pd, func)(left, right, **kwargs) @@ -2314,7 +2314,7 @@ def test_merge_join_cols_error_reporting_missing(func, kwargs, err_msg): # GH: 16228 left = DataFrame({"a": [1, 2], "b": [3, 4]}) right = DataFrame({"a": [1, 1], "c": [5, 6]}) - msg = rf"Must pass \"{err_msg[0]}\" OR \"{err_msg[1]}\"\." + msg = rf'Must pass "{err_msg[0]}" OR "{err_msg[1]}"\.' with pytest.raises(MergeError, match=msg): getattr(pd, func)(left, right, **kwargs) @@ -2332,8 +2332,8 @@ def test_merge_join_cols_error_reporting_on_and_index(func, kwargs): left = DataFrame({"a": [1, 2], "b": [3, 4]}) right = DataFrame({"a": [1, 1], "c": [5, 6]}) msg = ( - r"Can only pass argument \"on\" OR \"left_index\" " - r"and \"right_index\", not a combination of both\." + r'Can only pass argument "on" OR "left_index" ' + r'and "right_index", not a combination of both\.' ) with pytest.raises(MergeError, match=msg): getattr(pd, func)(left, right, on="a", **kwargs)