From f7c35d56cdfb7af842b54255029b7481ca9b6d94 Mon Sep 17 00:00:00 2001
From: martinfalisse <45781926+martinfalisse@users.noreply.github.com>
Date: Thu, 14 Apr 2022 20:27:51 +0200
Subject: [PATCH] Add support for numeric_only in DataFrame._reduce (#10629)

Add support for numeric_only in DataFrame._reduce, this way can use df.mean(numeric_only=True), etc. Resolves https://github.com/rapidsai/cudf/issues/2067. Also partially addresses https://github.com/rapidsai/cudf/issues/9009.

Authors:
  - https://github.com/martinfalisse

Approvers:
  - Michael Wang (https://github.com/isVoid)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/10629
---
 python/cudf/cudf/core/dataframe.py           | 25 +++---
 python/cudf/cudf/core/single_column_frame.py |  4 +-
 python/cudf/cudf/tests/test_dataframe.py     | 54 +++++++++++++
 python/cudf/cudf/tests/test_stats.py         | 83 +++++++++++++++++---
 4 files changed, 145 insertions(+), 21 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 2b2c09fa2a0..ae60cd91fac 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5180,26 +5180,33 @@ def _reduce(
         if level is not None:
             raise NotImplementedError("level parameter is not implemented yet")
 
-        if numeric_only not in (None, True):
-            raise NotImplementedError(
-                "numeric_only parameter is not implemented yet"
+        source = self
+        if numeric_only:
+            numeric_cols = (
+                name
+                for name in self._data.names
+                if is_numeric_dtype(self._data[name])
             )
-        axis = self._get_axis_from_axis_arg(axis)
+            source = self._get_columns_by_label(numeric_cols)
+            if source.empty:
+                return Series(index=cudf.StringIndex([]))
+
+        axis = source._get_axis_from_axis_arg(axis)
 
         if axis == 0:
             try:
                 result = [
-                    getattr(self._data[col], op)(**kwargs)
-                    for col in self._data.names
+                    getattr(source._data[col], op)(**kwargs)
+                    for col in source._data.names
                 ]
             except AttributeError:
-                raise TypeError(f"cannot perform {op} with type {self.dtype}")
+                raise TypeError(f"Not all column dtypes support op {op}")
 
             return Series._from_data(
-                {None: result}, as_index(self._data.names)
+                {None: result}, as_index(source._data.names)
             )
         elif axis == 1:
-            return self._apply_cupy_method_axis_1(op, **kwargs)
+            return source._apply_cupy_method_axis_1(op, **kwargs)
 
     @_cudf_nvtx_annotate
     def _scan(
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index 003f8ea7fdb..addc823e7f1 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -52,9 +52,9 @@ def _reduce(
         if level is not None:
             raise NotImplementedError("level parameter is not implemented yet")
 
-        if numeric_only not in (None, True):
+        if numeric_only:
             raise NotImplementedError(
-                "numeric_only parameter is not implemented yet"
+                f"Series.{op} does not implement numeric_only"
             )
         try:
             return getattr(self._column, op)(**kwargs)
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index a7fad792bd0..13ab0b35822 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -9230,3 +9230,57 @@ def test_dataframe_pct_change(data, periods, fill_method):
     expected = pdf.pct_change(periods=periods, fill_method=fill_method)
 
     assert_eq(expected, actual)
+
+
+def test_mean_timeseries():
+    gdf = cudf.datasets.timeseries()
+    pdf = gdf.to_pandas()
+
+    expected = pdf.mean(numeric_only=True)
+    actual = gdf.mean(numeric_only=True)
+
+    assert_eq(expected, actual)
+
+    with pytest.raises(TypeError):
+        gdf.mean()
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        {
+            "a": [1, 2, 3, 4, 5],
+            "b": ["a", "b", "c", "d", "e"],
+            "c": [1.0, 2.0, 3.0, 4.0, 5.0],
+        }
+    ],
+)
+def test_std_different_dtypes(data):
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    expected = pdf.std(numeric_only=True)
+    actual = gdf.std(numeric_only=True)
+
+    assert_eq(expected, actual)
+
+    with pytest.raises(TypeError):
+        gdf.std()
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        {
+            "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+            "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"],
+            "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"],
+        }
+    ],
+)
+def test_empty_numeric_only(data):
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+    expected = pdf.prod(numeric_only=True)
+    actual = gdf.prod(numeric_only=True)
+    assert_eq(expected, actual)
diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py
index 977a01952db..08f662f0ba7 100644
--- a/python/cudf/cudf/tests/test_stats.py
+++ b/python/cudf/cudf/tests/test_stats.py
@@ -239,13 +239,10 @@ def test_misc_quantiles(data, q):
         cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]),
         cudf.Series([]),
         cudf.Series([-3]),
-        randomdata(
-            nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str}
-        ),
     ],
 )
 @pytest.mark.parametrize("null_flag", [False, True])
-def test_kurtosis(data, null_flag):
+def test_kurtosis_series(data, null_flag):
     pdata = data.to_pandas()
 
     if null_flag and len(data) > 2:
@@ -262,8 +259,13 @@ def test_kurtosis(data, null_flag):
     expected = pdata.kurt()
     np.testing.assert_array_almost_equal(got, expected)
 
+    got = data.kurt(numeric_only=False)
+    got = got if np.isscalar(got) else got.to_numpy()
+    expected = pdata.kurt(numeric_only=False)
+    np.testing.assert_array_almost_equal(got, expected)
+
     with pytest.raises(NotImplementedError):
-        data.kurt(numeric_only=False)
+        data.kurt(numeric_only=True)
 
 
 @pytest.mark.parametrize(
@@ -280,13 +282,10 @@ def test_kurtosis(data, null_flag):
         cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]),
         cudf.Series([]),
         cudf.Series([-3]),
-        randomdata(
-            nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str}
-        ),
     ],
 )
 @pytest.mark.parametrize("null_flag", [False, True])
-def test_skew(data, null_flag):
+def test_skew_series(data, null_flag):
     pdata = data.to_pandas()
 
     if null_flag and len(data) > 2:
@@ -298,8 +297,13 @@ def test_skew(data, null_flag):
     got = got if np.isscalar(got) else got.to_numpy()
     np.testing.assert_array_almost_equal(got, expected)
 
+    got = data.skew(numeric_only=False)
+    expected = pdata.skew(numeric_only=False)
+    got = got if np.isscalar(got) else got.to_numpy()
+    np.testing.assert_array_almost_equal(got, expected)
+
     with pytest.raises(NotImplementedError):
-        data.skew(numeric_only=False)
+        data.skew(numeric_only=True)
 
 
 @pytest.mark.parametrize("dtype", params_dtypes)
@@ -541,3 +545,62 @@ def test_cov_corr_invalid_dtypes(gsr):
         rfunc_args_and_kwargs=([gsr],),
         compare_error_message=False,
     )
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        randomdata(
+            nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str}
+        ),
+    ],
+)
+@pytest.mark.parametrize("null_flag", [False, True])
+def test_kurtosis_df(data, null_flag):
+    pdata = data.to_pandas()
+
+    if null_flag and len(data) > 2:
+        data.iloc[[0, 2]] = None
+        pdata.iloc[[0, 2]] = None
+
+    got = data.kurtosis()
+    got = got if np.isscalar(got) else got.to_numpy()
+    expected = pdata.kurtosis()
+    np.testing.assert_array_almost_equal(got, expected)
+
+    got = data.kurt()
+    got = got if np.isscalar(got) else got.to_numpy()
+    expected = pdata.kurt()
+    np.testing.assert_array_almost_equal(got, expected)
+
+    got = data.kurt(numeric_only=True)
+    got = got if np.isscalar(got) else got.to_numpy()
+    expected = pdata.kurt(numeric_only=True)
+    np.testing.assert_array_almost_equal(got, expected)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        randomdata(
+            nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str}
+        ),
+    ],
+)
+@pytest.mark.parametrize("null_flag", [False, True])
+def test_skew_df(data, null_flag):
+    pdata = data.to_pandas()
+
+    if null_flag and len(data) > 2:
+        data.iloc[[0, 2]] = None
+        pdata.iloc[[0, 2]] = None
+
+    got = data.skew()
+    expected = pdata.skew()
+    got = got if np.isscalar(got) else got.to_numpy()
+    np.testing.assert_array_almost_equal(got, expected)
+
+    got = data.skew(numeric_only=True)
+    expected = pdata.skew(numeric_only=True)
+    got = got if np.isscalar(got) else got.to_numpy()
+    np.testing.assert_array_almost_equal(got, expected)