From f7c35d56cdfb7af842b54255029b7481ca9b6d94 Mon Sep 17 00:00:00 2001 From: martinfalisse <45781926+martinfalisse@users.noreply.github.com> Date: Thu, 14 Apr 2022 20:27:51 +0200 Subject: [PATCH] Add support for numeric_only in DataFrame._reduce (#10629) Add support for numeric_only in DataFrame._reduce so that df.mean(numeric_only=True) and the other DataFrame reductions can be restricted to numeric columns. Resolves https://github.com/rapidsai/cudf/issues/2067. Also partially addresses https://github.com/rapidsai/cudf/issues/9009. Authors: - https://github.com/martinfalisse Approvers: - Michael Wang (https://github.com/isVoid) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10629 --- python/cudf/cudf/core/dataframe.py | 25 +++--- python/cudf/cudf/core/single_column_frame.py | 4 +- python/cudf/cudf/tests/test_dataframe.py | 54 +++++++++++++ python/cudf/cudf/tests/test_stats.py | 83 +++++++++++++++++--- 4 files changed, 145 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2b2c09fa2a0..ae60cd91fac 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5180,26 +5180,33 @@ def _reduce( if level is not None: raise NotImplementedError("level parameter is not implemented yet") - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" + source = self + if numeric_only: + numeric_cols = ( + name + for name in self._data.names + if is_numeric_dtype(self._data[name]) ) - axis = self._get_axis_from_axis_arg(axis) + source = self._get_columns_by_label(numeric_cols) + if source.empty: + return Series(index=cudf.StringIndex([])) + + axis = source._get_axis_from_axis_arg(axis) if axis == 0: try: result = [ - getattr(self._data[col], op)(**kwargs) - for col in self._data.names + getattr(source._data[col], op)(**kwargs) + for col in source._data.names ] except AttributeError: - raise TypeError(f"cannot perform {op} with type 
{self.dtype}") + raise TypeError(f"Not all column dtypes support op {op}") return Series._from_data( - {None: result}, as_index(self._data.names) + {None: result}, as_index(source._data.names) ) elif axis == 1: - return self._apply_cupy_method_axis_1(op, **kwargs) + return source._apply_cupy_method_axis_1(op, **kwargs) @_cudf_nvtx_annotate def _scan( diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 003f8ea7fdb..addc823e7f1 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -52,9 +52,9 @@ def _reduce( if level is not None: raise NotImplementedError("level parameter is not implemented yet") - if numeric_only not in (None, True): + if numeric_only: raise NotImplementedError( - "numeric_only parameter is not implemented yet" + f"Series.{op} does not implement numeric_only" ) try: return getattr(self._column, op)(**kwargs) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index a7fad792bd0..13ab0b35822 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9230,3 +9230,57 @@ def test_dataframe_pct_change(data, periods, fill_method): expected = pdf.pct_change(periods=periods, fill_method=fill_method) assert_eq(expected, actual) + + +def test_mean_timeseries(): + gdf = cudf.datasets.timeseries() + pdf = gdf.to_pandas() + + expected = pdf.mean(numeric_only=True) + actual = gdf.mean(numeric_only=True) + + assert_eq(expected, actual) + + with pytest.raises(TypeError): + gdf.mean() + + +@pytest.mark.parametrize( + "data", + [ + { + "a": [1, 2, 3, 4, 5], + "b": ["a", "b", "c", "d", "e"], + "c": [1.0, 2.0, 3.0, 4.0, 5.0], + } + ], +) +def test_std_different_dtypes(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + expected = pdf.std(numeric_only=True) + actual = gdf.std(numeric_only=True) + + assert_eq(expected, actual) + + with 
pytest.raises(TypeError): + gdf.std() + + +@pytest.mark.parametrize( + "data", + [ + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"], + "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], + } + ], +) +def test_empty_numeric_only(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + expected = pdf.prod(numeric_only=True) + actual = gdf.prod(numeric_only=True) + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 977a01952db..08f662f0ba7 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -239,13 +239,10 @@ def test_misc_quantiles(data, q): cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), cudf.Series([]), cudf.Series([-3]), - randomdata( - nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} - ), ], ) @pytest.mark.parametrize("null_flag", [False, True]) -def test_kurtosis(data, null_flag): +def test_kurtosis_series(data, null_flag): pdata = data.to_pandas() if null_flag and len(data) > 2: @@ -262,8 +259,13 @@ def test_kurtosis(data, null_flag): expected = pdata.kurt() np.testing.assert_array_almost_equal(got, expected) + got = data.kurt(numeric_only=False) + got = got if np.isscalar(got) else got.to_numpy() + expected = pdata.kurt(numeric_only=False) + np.testing.assert_array_almost_equal(got, expected) + with pytest.raises(NotImplementedError): - data.kurt(numeric_only=False) + data.kurt(numeric_only=True) @pytest.mark.parametrize( @@ -280,13 +282,10 @@ def test_kurtosis(data, null_flag): cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), cudf.Series([]), cudf.Series([-3]), - randomdata( - nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} - ), ], ) @pytest.mark.parametrize("null_flag", [False, True]) -def test_skew(data, null_flag): +def test_skew_series(data, null_flag): pdata = data.to_pandas() if null_flag and 
len(data) > 2: @@ -298,8 +297,13 @@ def test_skew(data, null_flag): got = got if np.isscalar(got) else got.to_numpy() np.testing.assert_array_almost_equal(got, expected) + got = data.skew(numeric_only=False) + expected = pdata.skew(numeric_only=False) + got = got if np.isscalar(got) else got.to_numpy() + np.testing.assert_array_almost_equal(got, expected) + with pytest.raises(NotImplementedError): - data.skew(numeric_only=False) + data.skew(numeric_only=True) @pytest.mark.parametrize("dtype", params_dtypes) @@ -541,3 +545,62 @@ def test_cov_corr_invalid_dtypes(gsr): rfunc_args_and_kwargs=([gsr],), compare_error_message=False, ) + + +@pytest.mark.parametrize( + "data", + [ + randomdata( + nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} + ), + ], +) +@pytest.mark.parametrize("null_flag", [False, True]) +def test_kurtosis_df(data, null_flag): + pdata = data.to_pandas() + + if null_flag and len(data) > 2: + data.iloc[[0, 2]] = None + pdata.iloc[[0, 2]] = None + + got = data.kurtosis() + got = got if np.isscalar(got) else got.to_numpy() + expected = pdata.kurtosis() + np.testing.assert_array_almost_equal(got, expected) + + got = data.kurt() + got = got if np.isscalar(got) else got.to_numpy() + expected = pdata.kurt() + np.testing.assert_array_almost_equal(got, expected) + + got = data.kurt(numeric_only=True) + got = got if np.isscalar(got) else got.to_numpy() + expected = pdata.kurt(numeric_only=True) + np.testing.assert_array_almost_equal(got, expected) + + +@pytest.mark.parametrize( + "data", + [ + randomdata( + nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} + ), + ], +) +@pytest.mark.parametrize("null_flag", [False, True]) +def test_skew_df(data, null_flag): + pdata = data.to_pandas() + + if null_flag and len(data) > 2: + data.iloc[[0, 2]] = None + pdata.iloc[[0, 2]] = None + + got = data.skew() + expected = pdata.skew() + got = got if np.isscalar(got) else got.to_numpy() + np.testing.assert_array_almost_equal(got, expected) + + got = 
data.skew(numeric_only=True) + expected = pdata.skew(numeric_only=True) + got = got if np.isscalar(got) else got.to_numpy() + np.testing.assert_array_almost_equal(got, expected)