diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 00f2735f76552e..cf8ba84ecd4f8d 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -106,9 +106,8 @@ consider the following ``DataFrame``: .. versionadded:: 0.20 A string passed to ``groupby`` may refer to either a column or an index level. - If a string matches both a column name and an index level name then a warning is - issued and the column takes precedence. This will result in an ambiguity error - in a future version. + If a string matches both a column name and an index level name, a + ``ValueError`` will be raised. .. ipython:: python diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index c18cd643a66487..317b8b88783080 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -523,6 +523,7 @@ Removal of prior version deprecations/changes - :meth:`Series.repeat` has renamed the ``reps`` argument to ``repeats`` (:issue:`14645`) - Several private functions were removed from the (non-public) module ``pandas.core.common`` (:issue:`22001`) - Removal of the previously deprecated module ``pandas.core.datetools`` (:issue:`14105`, :issue:`14094`) +- Strings passed into :meth:`DataFrame.groupby` that refer to both column and index levels will raise a ``ValueError`` (:issue:`14432`) - .. _whatsnew_0240.performance: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 409a722017e720..052952103e28c3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4393,7 +4393,6 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last'): inplace = validate_bool_kwarg(inplace, 'inplace') axis = self._get_axis_number(axis) - stacklevel = 2 # Number of stack levels from df.sort_values if not isinstance(by, list): by = [by] @@ -4405,8 +4404,7 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, keys = [] for x in by: - k = self._get_label_or_level_values(x, axis=axis, - stacklevel=stacklevel) + k = self._get_label_or_level_values(x, axis=axis) keys.append(k) indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position) @@ -4415,8 +4413,7 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, from pandas.core.sorting import nargsort by = by[0] - k = self._get_label_or_level_values(by, axis=axis, - stacklevel=stacklevel) + k = self._get_label_or_level_values(by, axis=axis) if isinstance(ascending, (tuple, list)): ascending = ascending[0] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9bdf34113ccf04..0e5204fcd65245 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1412,14 +1412,12 @@ def _is_label_or_level_reference(self, key, axis=0): return (self._is_level_reference(key, axis=axis) or self._is_label_reference(key, axis=axis)) - def _check_label_or_level_ambiguity(self, key, axis=0, stacklevel=1): + def _check_label_or_level_ambiguity(self, key, axis=0): """ - Check whether `key` matches both a level of the input `axis` and a - label of the other axis and raise a ``FutureWarning`` if this is the - case. + Check whether `key` is ambiguous. - Note: This method will be altered to raise an ambiguity exception in - a future version. + By ambiguous, we mean that it matches both a level of the input + `axis` and a label of the other axis. Parameters ---------- @@ -1427,18 +1425,10 @@ def _check_label_or_level_ambiguity(self, key, axis=0, stacklevel=1): label or level name axis: int, default 0 Axis that levels are associated with (0 for index, 1 for columns) - stacklevel: int, default 1 - Stack level used when a FutureWarning is raised (see below). - - Returns - ------- - ambiguous: bool Raises ------ - FutureWarning - if `key` is ambiguous. This will become an ambiguity error in a - future version + ValueError: `key` is ambiguous """ axis = self._get_axis_number(axis) @@ -1464,21 +1454,15 @@ def _check_label_or_level_ambiguity(self, key, axis=0, stacklevel=1): ('an', 'index')) msg = ("'{key}' is both {level_article} {level_type} level and " - "{label_article} {label_type} label.\n" - "Defaulting to {label_type}, but this will raise an " - "ambiguity error in a future version" + "{label_article} {label_type} label, which is ambiguous." ).format(key=key, level_article=level_article, level_type=level_type, label_article=label_article, label_type=label_type) + raise ValueError(msg) - warnings.warn(msg, FutureWarning, stacklevel=stacklevel + 1) - return True - else: - return False - - def _get_label_or_level_values(self, key, axis=0, stacklevel=1): + def _get_label_or_level_values(self, key, axis=0): """ Return a 1-D array of values associated with `key`, a label or level from the given `axis`. @@ -1497,8 +1481,6 @@ def _get_label_or_level_values(self, key, axis=0, stacklevel=1): Label or level name. axis: int, default 0 Axis that levels are associated with (0 for index, 1 for columns) - stacklevel: int, default 1 - Stack level used when a FutureWarning is raised (see below). Returns ------- @@ -1524,8 +1506,7 @@ def _get_label_or_level_values(self, key, axis=0, stacklevel=1): .format(type=type(self))) if self._is_label_reference(key, axis=axis): - self._check_label_or_level_ambiguity(key, axis=axis, - stacklevel=stacklevel + 1) + self._check_label_or_level_ambiguity(key, axis=axis) values = self.xs(key, axis=other_axes[0])._values elif self._is_level_reference(key, axis=axis): values = self.axes[axis].get_level_values(key)._values diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 35d4a024a4e6c6..e7144fb1d2932c 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -571,9 +571,7 @@ def is_in_obj(gpr): elif is_in_axis(gpr): # df.groupby('name') if gpr in obj: if validate: - stacklevel = 5 # Number of stack levels from df.groupby - obj._check_label_or_level_ambiguity( - gpr, stacklevel=stacklevel) + obj._check_label_or_level_ambiguity(gpr) in_axis, name, gpr = True, gpr, obj[gpr] exclusions.append(name) elif obj._is_level_reference(gpr): diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 3989c70c9d13ff..c4305136accb15 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -811,7 +811,6 @@ def _get_merge_keys(self): left_drop = [] left, right = self.left, self.right - stacklevel = 5 # Number of stack levels from df.merge is_lkey = lambda x: is_array_like(x) and len(x) == len(left) is_rkey = lambda x: is_array_like(x) and len(x) == len(right) @@ -837,8 +836,7 @@ def _get_merge_keys(self): else: if rk is not None: right_keys.append( - right._get_label_or_level_values( - rk, stacklevel=stacklevel)) + right._get_label_or_level_values(rk)) join_names.append(rk) else: # work-around for merge_asof(right_index=True) @@ -848,8 +846,7 @@ def _get_merge_keys(self): if not is_rkey(rk): if rk is not None: right_keys.append( - right._get_label_or_level_values( - rk, stacklevel=stacklevel)) + right._get_label_or_level_values(rk)) else: # work-around for merge_asof(right_index=True) right_keys.append(right.index) @@ -862,8 +859,7 @@ def _get_merge_keys(self): else: right_keys.append(rk) if lk is not None: - left_keys.append(left._get_label_or_level_values( - lk, stacklevel=stacklevel)) + left_keys.append(left._get_label_or_level_values(lk)) join_names.append(lk) else: # work-around for merge_asof(left_index=True) @@ -875,8 +871,7 @@ def _get_merge_keys(self): left_keys.append(k) join_names.append(None) else: - left_keys.append(left._get_label_or_level_values( - k, stacklevel=stacklevel)) + left_keys.append(left._get_label_or_level_values(k)) join_names.append(k) if isinstance(self.right.index, MultiIndex): right_keys = [lev._values.take(lab) @@ -890,8 +885,7 @@ def _get_merge_keys(self): right_keys.append(k) join_names.append(None) else: - right_keys.append(right._get_label_or_level_values( - k, stacklevel=stacklevel)) + right_keys.append(right._get_label_or_level_values(k)) join_names.append(k) if isinstance(self.left.index, MultiIndex): left_keys = [lev._values.take(lab) diff --git a/pandas/tests/frame/test_sort_values_level_as_str.py b/pandas/tests/frame/test_sort_values_level_as_str.py index 3b4eadfce81cd3..2653cc77b27a4e 100644 --- a/pandas/tests/frame/test_sort_values_level_as_str.py +++ b/pandas/tests/frame/test_sort_values_level_as_str.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas import DataFrame, Index +from pandas import DataFrame from pandas.errors import PerformanceWarning from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal @@ -93,34 +93,3 @@ def test_sort_column_level_and_index_label( assert_frame_equal(result, expected) else: assert_frame_equal(result, expected) - - -def test_sort_values_column_index_level_precedence(): - # GH 14353, when a string passed as the `by` parameter - # matches a column and an index level the column takes - # precedence - - # Construct DataFrame with index and column named 'idx' - idx = Index(np.arange(1, 7), name='idx') - df = DataFrame({'A': np.arange(11, 17), - 'idx': np.arange(6, 0, -1)}, - index=idx) - - # Sorting by 'idx' should sort by the idx column and raise a - # FutureWarning - with tm.assert_produces_warning(FutureWarning): - result = df.sort_values(by='idx') - - # This should be equivalent to sorting by the 'idx' index level in - # descending order - expected = df.sort_index(level='idx', ascending=False) - assert_frame_equal(result, expected) - - # Perform same test with MultiIndex - df_multi = df.set_index('A', append=True) - - with tm.assert_produces_warning(FutureWarning): - result = df_multi.sort_values(by='idx') - - expected = df_multi.sort_index(level='idx', ascending=False) - assert_frame_equal(result, expected) diff --git a/pandas/tests/generic/test_label_or_level_utils.py b/pandas/tests/generic/test_label_or_level_utils.py index 8e4d28fc796df6..4d78270c856ae0 100644 --- a/pandas/tests/generic/test_label_or_level_utils.py +++ b/pandas/tests/generic/test_label_or_level_utils.py @@ -166,31 +166,24 @@ def test_is_label_or_level_reference_panel_error(panel): def test_check_label_or_level_ambiguity_df(df_ambig, axis): # Transpose frame if axis == 1 - if axis in {1, 'columns'}: + if axis in {1, "columns"}: df_ambig = df_ambig.T - # df_ambig has both an on-axis level and off-axis label named L1 - # Therefore L1 is ambiguous - with tm.assert_produces_warning(FutureWarning, - clear=True) as w: + if axis in {0, "index"}: + msg = "'L1' is both an index level and a column label" + else: + msg = "'L1' is both a column level and an index label" - assert df_ambig._check_label_or_level_ambiguity('L1', axis=axis) - warning_msg = w[0].message.args[0] - if axis in {0, 'index'}: - assert warning_msg.startswith("'L1' is both an index level " - "and a column label") - else: - assert warning_msg.startswith("'L1' is both a column level " - "and an index label") + # df_ambig has both an on-axis level and off-axis label named L1 + # Therefore, L1 is ambiguous. + with tm.assert_raises_regex(ValueError, msg): + df_ambig._check_label_or_level_ambiguity("L1", axis=axis) - # df_ambig has an on-axis level named L2 and it is not ambiguous - # No warning should be raised - with tm.assert_produces_warning(None): - assert not df_ambig._check_label_or_level_ambiguity('L2', axis=axis) + # df_ambig has an on-axis level named L2,, and it is not ambiguous. + df_ambig._check_label_or_level_ambiguity("L2", axis=axis) - # df_ambig has an off-axis label named L3 and it is not ambiguous - with tm.assert_produces_warning(None): - assert not df_ambig._is_level_reference('L3', axis=axis) + # df_ambig has an off-axis label named L3, and it is not ambiguous + assert not df_ambig._check_label_or_level_ambiguity("L3", axis=axis) # Series @@ -200,17 +193,15 @@ def test_check_label_or_level_ambiguity_series(df): # A series has no columns and therefore references are never ambiguous # Make series with L1 as index - s = df.set_index('L1').L2 - with tm.assert_produces_warning(None): - assert not s._check_label_or_level_ambiguity('L1', axis=0) - assert not s._check_label_or_level_ambiguity('L2', axis=0) + s = df.set_index("L1").L2 + s._check_label_or_level_ambiguity("L1", axis=0) + s._check_label_or_level_ambiguity("L2", axis=0) # Make series with L1 and L2 as index - s = df.set_index(['L1', 'L2']).L3 - with tm.assert_produces_warning(None): - assert not s._check_label_or_level_ambiguity('L1', axis=0) - assert not s._check_label_or_level_ambiguity('L2', axis=0) - assert not s._check_label_or_level_ambiguity('L3', axis=0) + s = df.set_index(["L1", "L2"]).L3 + s._check_label_or_level_ambiguity("L1", axis=0) + s._check_label_or_level_ambiguity("L2", axis=0) + s._check_label_or_level_ambiguity("L3", axis=0) def test_check_label_or_level_ambiguity_series_axis1_error(df): @@ -229,7 +220,7 @@ def test_check_label_or_level_ambiguity_panel_error(panel): .format(type=type(panel))) with tm.assert_raises_regex(NotImplementedError, msg): - panel._check_label_or_level_ambiguity('L1', axis=0) + panel._check_label_or_level_ambiguity("L1", axis=0) # Test _get_label_or_level_values @@ -241,19 +232,16 @@ def assert_label_values(frame, labels, axis): else: expected = frame.loc[label]._values - result = frame._get_label_or_level_values(label, axis=axis, - stacklevel=2) + result = frame._get_label_or_level_values(label, axis=axis) assert array_equivalent(expected, result) def assert_level_values(frame, levels, axis): for level in levels: - if axis in {0, 'index'}: + if axis in {0, "index"}: expected = frame.index.get_level_values(level=level)._values else: - expected = (frame.columns - .get_level_values(level=level) - ._values) + expected = frame.columns.get_level_values(level=level)._values result = frame._get_label_or_level_values(level, axis=axis) assert array_equivalent(expected, result) @@ -281,18 +269,11 @@ def test_get_label_or_level_values_df_ambig(df_ambig, axis): if axis in {1, 'columns'}: df_ambig = df_ambig.T - # df has both an on-axis level and off-axis label named L1 - # Therefore L1 is ambiguous but will default to label - with tm.assert_produces_warning(FutureWarning): - assert_label_values(df_ambig, ['L1'], axis=axis) - - # df has an on-axis level named L2 and it is not ambiguous - with tm.assert_produces_warning(None): - assert_level_values(df_ambig, ['L2'], axis=axis) + # df has an on-axis level named L2, and it is not ambiguous. + assert_level_values(df_ambig, ['L2'], axis=axis) - # df has an off-axis label named L3 and it is not ambiguous - with tm.assert_produces_warning(None): - assert_label_values(df_ambig, ['L3'], axis=axis) + # df has an off-axis label named L3, and it is not ambiguous. + assert_label_values(df_ambig, ['L3'], axis=axis) def test_get_label_or_level_values_df_duplabels(df_duplabels, axis): diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index d021396a7acb31..14a09b83e5b7ce 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -568,18 +568,9 @@ def test_as_index(): 'B': [101, 205]}, columns=['cat', 'A', 'B']) - for name in [None, 'X', 'B', 'cat']: + for name in [None, 'X', 'B']: df.index = Index(list("abc"), name=name) - - if name in group_columns and name in df.index.names: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = df.groupby( - group_columns, as_index=False, observed=True).sum() - - else: - result = df.groupby( - group_columns, as_index=False, observed=True).sum() + result = df.groupby(group_columns, as_index=False, observed=True).sum() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_index_as_string.py b/pandas/tests/groupby/test_index_as_string.py index 9fe677664049ee..6afa63c31e3b67 100644 --- a/pandas/tests/groupby/test_index_as_string.py +++ b/pandas/tests/groupby/test_index_as_string.py @@ -3,7 +3,6 @@ import numpy as np from pandas.util.testing import assert_frame_equal, assert_series_equal -import pandas.util.testing as tm @pytest.fixture(params=[['inner'], ['inner', 'outer']]) @@ -67,50 +66,3 @@ def test_grouper_index_level_as_string_series(series, levels): # Compute and check result result = series.groupby(levels).mean() assert_series_equal(result, expected) - - -@pytest.mark.parametrize('key_strs,key_groupers,level_groupers', [ - ('inner', # Index name - pd.Grouper(key='inner'), - pd.Grouper(level='inner'), - ), - (['inner'], # List of index name - [pd.Grouper(key='inner')], - [pd.Grouper(level='inner')] - ), - (['B', 'inner'], # Column and index - ['B', pd.Grouper(key='inner')], - ['B', pd.Grouper(level='inner')] - ), - (['inner', 'B'], # Index and column - [pd.Grouper(key='inner'), 'B'], - [pd.Grouper(level='inner'), 'B'])]) -def test_grouper_column_index_level_precedence(frame, - key_strs, - key_groupers, - level_groupers): - - # GH 5677, when a string passed as the `by` parameter - # matches a column and an index level the column takes - # precedence and a FutureWarning is raised - - # Add 'inner' column to frame - # (frame already has an 'inner' index) - frame['inner'] = [1, 1, 1, 1, 1, 1] - - # Performing a groupby with strings should produce warning - with tm.assert_produces_warning(FutureWarning): - result = frame.groupby(key_strs).mean() - - # Grouping with key Grouper should produce the same result and no warning - with tm.assert_produces_warning(False): - expected = frame.groupby(key_groupers).mean() - - assert_frame_equal(result, expected) - - # Grouping with level Grouper should produce a different result but - # still no warning - with tm.assert_produces_warning(False): - not_expected = frame.groupby(level_groupers).mean() - - assert not result.index.equals(not_expected.index) diff --git a/pandas/tests/reshape/merge/test_merge_index_as_string.py b/pandas/tests/reshape/merge/test_merge_index_as_string.py index a27fcf41681e66..12d9483af87614 100644 --- a/pandas/tests/reshape/merge/test_merge_index_as_string.py +++ b/pandas/tests/reshape/merge/test_merge_index_as_string.py @@ -2,7 +2,6 @@ import pytest from pandas import DataFrame -from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal @@ -176,38 +175,3 @@ def test_join_indexes_and_columns_on(df1, df2, left_index, join_type): lsuffix='_x', rsuffix='_y') assert_frame_equal(result, expected, check_like=True) - - -def test_merge_index_column_precedence(df1, df2): - - # Construct left_df with both an index and a column named 'outer'. - # We make this 'outer' column equal to the 'inner' column so that we - # can verify that the correct values are used by the merge operation - left_df = df1.set_index('outer') - left_df['outer'] = left_df['inner'] - - # Construct right_df with an index level named 'outer' - right_df = df2.set_index('outer') - - # Construct expected result. - # The 'outer' column from left_df is chosen and the resulting - # frame has no index levels - expected = (left_df.reset_index(level='outer', drop=True) - .merge(right_df.reset_index(), on=['outer', 'inner'])) - - # Merge left_df and right_df on 'outer' and 'inner' - # 'outer' for left_df should refer to the 'outer' column, not the - # 'outer' index level and a FutureWarning should be raised - with tm.assert_produces_warning(FutureWarning): - result = left_df.merge(right_df, on=['outer', 'inner']) - - # Check results - assert_frame_equal(result, expected) - - # Perform the same using the left_on and right_on parameters - with tm.assert_produces_warning(FutureWarning): - result = left_df.merge(right_df, - left_on=['outer', 'inner'], - right_on=['outer', 'inner']) - - assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index e66758f58b1d4c..1ee48d0120c7d0 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -596,52 +596,6 @@ def _check_output(result, values_col, index=['A', 'B'], totals = table.loc[('All', ''), item] assert totals == self.data[item].mean() - # issue number #8349: pivot_table with margins and dictionary aggfunc - data = [ - {'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2013, - 'MONTH': 12, 'DAYS': 3, 'SALARY': 17}, - {'JOB': 'Employ', 'NAME': - 'Mary', 'YEAR': 2013, 'MONTH': 12, 'DAYS': 5, 'SALARY': 23}, - {'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2014, - 'MONTH': 1, 'DAYS': 10, 'SALARY': 100}, - {'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2014, - 'MONTH': 1, 'DAYS': 11, 'SALARY': 110}, - {'JOB': 'Employ', 'NAME': 'Mary', 'YEAR': 2014, - 'MONTH': 1, 'DAYS': 15, 'SALARY': 200}, - {'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2014, - 'MONTH': 2, 'DAYS': 8, 'SALARY': 80}, - {'JOB': 'Employ', 'NAME': 'Mary', 'YEAR': 2014, - 'MONTH': 2, 'DAYS': 5, 'SALARY': 190}, - ] - - df = DataFrame(data) - - df = df.set_index(['JOB', 'NAME', 'YEAR', 'MONTH'], drop=False, - append=False) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df.pivot_table(index=['JOB', 'NAME'], - columns=['YEAR', 'MONTH'], - values=['DAYS', 'SALARY'], - aggfunc={'DAYS': 'mean', 'SALARY': 'sum'}, - margins=True) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = df.pivot_table(index=['JOB', 'NAME'], - columns=['YEAR', 'MONTH'], - values=['DAYS'], - aggfunc='mean', margins=True) - - tm.assert_frame_equal(result['DAYS'], expected['DAYS']) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = df.pivot_table(index=['JOB', 'NAME'], - columns=['YEAR', 'MONTH'], - values=['SALARY'], - aggfunc='sum', margins=True) - - tm.assert_frame_equal(result['SALARY'], expected['SALARY']) - def test_margins_dtype(self): # GH 17013