diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index f491774991090..bb315987b21d9 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -59,6 +59,10 @@ Previously indexing with a nullable Boolean array containing ``NA`` would raise Bug fixes ~~~~~~~~~ +**ExtensionArray** + +- Fixed issue where taking ``min`` or ``max`` of a ``StringArray`` or ``Series`` with ``StringDtype`` type would raise. (:issue:`31746`) + **Datetimelike** - Bug in :meth:`DataFrame.reindex` and :meth:`Series.reindex` when reindexing with a tz-aware index (:issue:`26683`) @@ -74,6 +78,10 @@ Bug fixes - Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`) - Fixed bug in parquet roundtrip with nullable unsigned integer dtypes (:issue:`31896`). +**Reduction** + +- Fixed bug where :meth:`Series.min` and :meth:`Series.max` would raise a ``TypeError`` in the presence of ``NaN`` for object dtype. (:issue:`18588`) + **Experimental dtypes** - Fix bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the ``"string"`` dtype (:issue:`31731`). diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index fcccd8cc14d6b..7df4b6d297cb9 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -4,6 +4,7 @@ import numpy as np from pandas._libs import lib, missing as libmissing +from pandas.compat.numpy import function as nv from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import pandas_dtype @@ -12,7 +13,7 @@ from pandas.core.dtypes.inference import is_array_like from pandas import compat -from pandas.core import ops +from pandas.core import nanops, ops from pandas.core.arrays import PandasArray from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer @@ -274,8 +275,21 @@ def astype(self, dtype, copy=True): return super().astype(dtype, copy) def _reduce(self, name, skipna=True, **kwargs): + if name in ["min", "max"]: + return getattr(self, name)(skipna=skipna, **kwargs) + raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + def min(self, axis=None, out=None, keepdims=False, skipna=True): + nv.validate_min((), dict(out=out, keepdims=keepdims)) + result = nanops.nanmin(self._ndarray, axis=axis, skipna=skipna, mask=isna(self)) + return libmissing.NA if isna(result) else result + + def max(self, axis=None, out=None, keepdims=False, skipna=True): + nv.validate_max((), dict(out=out, keepdims=keepdims)) + result = nanops.nanmax(self._ndarray, axis=axis, skipna=skipna, mask=isna(self)) + return libmissing.NA if isna(result) else result + def value_counts(self, dropna=False): from pandas import value_counts diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index a5c609473760d..1582be35861e1 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -228,7 +228,9 @@ def _maybe_get_mask( # Boolean data cannot contain nulls, so signal via mask being None return None - if skipna: + if skipna or is_object_dtype(values.dtype): + # The masking in nanminmax does not work for object dtype, so always + # compute a mask for use later when skipna is False mask = isna(values) return mask @@ -865,6 +867,17 @@ def reduction( result.fill(np.nan) except (AttributeError, TypeError, ValueError): result = np.nan + elif ( + is_object_dtype(dtype) + and values.ndim == 1 + and mask is not None + and mask.any() + ): + # Need to explicitly mask NA values for object dtypes + if skipna: + result = getattr(values[~mask], meth)(axis) + else: + result = np.nan else: result = getattr(values, meth)(axis) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 5e2f14af341ab..cff5278a23b0a 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -269,3 +269,31 @@ def test_value_counts_na(): result = arr.value_counts(dropna=True) expected = pd.Series([2, 1], index=["a", "b"], dtype="Int64") tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", ["min", "max"]) +@pytest.mark.parametrize("skipna", [True, False]) +def test_reduction(func, skipna): + s = pd.Series(["x", "y", "z"], dtype="string") + result = getattr(s, func)(skipna=skipna) + expected = "x" if func == "min" else "z" + + assert result == expected + + +@pytest.mark.parametrize("func", ["min", "max"]) +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("box_in_series", [True, False]) +def test_reduction_with_na(func, skipna, box_in_series): + data = pd.array([pd.NA, "y", "z"], dtype="string") + + if box_in_series: + data = pd.Series(data) + + result = getattr(data, func)(skipna=skipna) + + if skipna: + expected = "y" if func == "min" else "z" + assert result == expected + else: + assert result is pd.NA diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 86aed671f1b88..5a6522efd21ea 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -77,7 +77,20 @@ class TestMissing(base.BaseMissingTests): class TestNoReduce(base.BaseNoReduceTests): - pass + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): + if isinstance(data, pd.arrays.StringArray) and all_numeric_reductions in [ + "min", + "max", + ]: + # Tested in pandas/tests/arrays/string_/test_string.py + pytest.skip("These reductions are implemented") + + op_name = all_numeric_reductions + s = pd.Series(data) + + with pytest.raises(TypeError): + getattr(s, op_name)(skipna=skipna) class TestMethods(base.BaseMethodsTests): diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index fe6abef97acc4..b982ac7d7f2ed 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1406,8 +1406,8 @@ def test_apply_datetime_tz_issue(self): @pytest.mark.parametrize("method", ["min", "max", "sum"]) def test_consistency_of_aggregates_of_columns_with_missing_values(self, df, method): # GH 16832 - none_in_first_column_result = getattr(df[["A", "B"]], method)() - none_in_second_column_result = getattr(df[["B", "A"]], method)() + none_in_first_column_result = getattr(df[["A", "B"]], method)().sort_index() + none_in_second_column_result = getattr(df[["B", "A"]], method)().sort_index() tm.assert_series_equal( none_in_first_column_result, none_in_second_column_result diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 211d0d52d8357..d0bc51ba4718d 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -172,6 +172,19 @@ def test_numpy_reduction_with_tz_aware_dtype(self, tz_aware_fixture, func): result = getattr(np, func)(expected, expected) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("func", ["min", "max"]) + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_object_with_na(self, func, skipna): + # https://github.com/pandas-dev/pandas/issues/18588 + s = pd.Series(np.array(["a", "b", np.nan], dtype=object)) + result = getattr(s, func)(skipna=skipna) + + if skipna: + expected = "a" if func == "min" else "b" + assert result == expected + else: + assert result is np.nan + class TestIndexReductions: # Note: the name TestIndexReductions indicates these tests