Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API/DEPR: Change default skipna behaviour + deprecate numeric_only in Categorical.min and max #27929

Merged
merged 15 commits into from
Dec 2, 2019
Merged
22 changes: 22 additions & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,26 @@ The following methods now also correctly output values for unobserved categories
df.groupby(["cat_1", "cat_2"], observed=False)["value"].count()


By default :meth:`Categorical.min` return the min and the max respectively
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When :class:`Categorical` contains ``np.nan``,
:meth:`Categorical.min` no longer return ``np.nan`` by default (skipna=True).
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved

*pandas 0.25.x*

.. code-block:: ipython

In [1]: pd.Categorical([1, 2, np.nan], ordered=True).min()
Out[1]: nan


*pandas 1.0.0*

.. ipython:: python

pd.Categorical([1, 2, np.nan], ordered=True).min()

.. _whatsnew_1000.api_breaking.deps:

Increased minimum versions for dependencies
Expand Down Expand Up @@ -339,6 +359,8 @@ Deprecations
value in ``idx`` of ``idx_val`` and a new value of ``val``, ``idx.set_value(arr, idx_val, val)``
is equivalent to ``arr[idx.get_loc(idx_val)] = val``, which should be used instead (:issue:`28621`).
- :func:`is_extension_type` is deprecated, :func:`is_extension_array_dtype` should be used instead (:issue:`29457`)
- The parameter ``numeric_only`` of :meth:`Categorical.min` and :meth:`Categorical.max` is deprecated and replaced with ``skipna`` (:issue:`25303`)
-


.. _whatsnew_1000.prior_deprecations:
Expand Down
34 changes: 18 additions & 16 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2175,7 +2175,8 @@ def _reduce(self, name, axis=0, **kwargs):
raise TypeError(f"Categorical cannot perform the operation {name}")
return func(**kwargs)

def min(self, numeric_only=None, **kwargs):
@deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
def min(self, skipna=True, **kwargs):
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
"""
The minimum value of the object.

Expand All @@ -2191,17 +2192,18 @@ def min(self, numeric_only=None, **kwargs):
min : the minimum of this `Categorical`
"""
self.check_for_ordered("min")
if numeric_only:
good = self._codes != -1
pointer = self._codes[good].min(**kwargs)
good = self._codes != -1
if not good.all():
if skipna:
pointer = self._codes[good].min(**kwargs)
else:
return np.nan
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is not correct for i8 types, which should be pd.NaT. how to fix this?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could check the categories.dtype.na_value if it exists. But since this is the current behaviour, it's not critical to fix in this PR I think.

else:
pointer = self._codes.min(**kwargs)
if pointer == -1:
return np.nan
else:
return self.categories[pointer]
return self.categories[pointer]

def max(self, numeric_only=None, **kwargs):
@deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
def max(self, skipna=True, **kwargs):
"""
The maximum value of the object.

Expand All @@ -2217,15 +2219,15 @@ def max(self, numeric_only=None, **kwargs):
max : the maximum of this `Categorical`
"""
self.check_for_ordered("max")
if numeric_only:
good = self._codes != -1
pointer = self._codes[good].max(**kwargs)
good = self._codes != -1
if not good.all():
if skipna:
pointer = self._codes[good].max(**kwargs)
else:
return np.nan
else:
pointer = self._codes.max(**kwargs)
if pointer == -1:
return np.nan
else:
return self.categories[pointer]
return self.categories[pointer]

def mode(self, dropna=True):
"""
Expand Down
4 changes: 1 addition & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3925,9 +3925,7 @@ def _reduce(
self._get_axis_number(axis)

if isinstance(delegate, Categorical):
# TODO deprecate numeric_only argument for Categorical and use
# skipna as well, see GH25303
return delegate._reduce(name, numeric_only=numeric_only, **kwds)
return delegate._reduce(name, skipna=skipna, **kwds)
elif isinstance(delegate, ExtensionArray):
# dispatch to ExtensionArray interface
return delegate._reduce(name, skipna=skipna, **kwds)
Expand Down
46 changes: 29 additions & 17 deletions pandas/tests/arrays/categorical/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,31 +35,43 @@ def test_min_max(self):
assert _min == "d"
assert _max == "a"

@pytest.mark.parametrize("skipna", [True, False])
def test_min_max_with_nan(self, skipna):
# GH 25303
cat = Categorical(
[np.nan, "b", "c", np.nan], categories=["d", "c", "b", "a"], ordered=True
)
_min = cat.min()
_max = cat.max()
assert np.isnan(_min)
assert _max == "b"
_min = cat.min(skipna=skipna)
_max = cat.max(skipna=skipna)

_min = cat.min(numeric_only=True)
assert _min == "c"
_max = cat.max(numeric_only=True)
assert _max == "b"
if skipna is False:
assert np.isnan(_min)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use isna/notna

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

np.isnan is a more strict / correct test in this case, since we are actually returning NaN (and not None, NA or NaT)

assert np.isnan(_max)
else:
assert _min == "c"
assert _max == "b"

cat = Categorical(
[np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True
)
_min = cat.min()
_max = cat.max()
assert np.isnan(_min)
assert _max == 1

_min = cat.min(numeric_only=True)
assert _min == 2
_max = cat.max(numeric_only=True)
assert _max == 1
_min = cat.min(skipna=skipna)
_max = cat.max(skipna=skipna)

if skipna is False:
assert np.isnan(_min)
assert np.isnan(_max)
else:
assert _min == 2
assert _max == 1

@pytest.mark.parametrize("method", ["min", "max"])
def test_deprecate_numeric_only_min_max(self, method):
# GH 25303
cat = Categorical(
[np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True
)
with tm.assert_produces_warning(expected_warning=FutureWarning):
getattr(cat, method)(numeric_only=True)

@pytest.mark.parametrize(
"values,categories,exp_mode",
Expand Down
32 changes: 13 additions & 19 deletions pandas/tests/reductions/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1043,7 +1043,7 @@ def test_min_max(self):
)
_min = cat.min()
_max = cat.max()
assert np.isnan(_min)
assert _min == "c"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why idd this change?

Copy link
Contributor Author

@makbigc makbigc Aug 26, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please refer to the following comment.

assert _max == "b"

cat = Series(
Expand All @@ -1053,30 +1053,24 @@ def test_min_max(self):
)
_min = cat.min()
_max = cat.max()
assert np.isnan(_min)
assert _min == 2
assert _max == 1

def test_min_max_numeric_only(self):
# TODO deprecate numeric_only argument for Categorical and use
# skipna as well, see GH25303
@pytest.mark.parametrize("skipna", [True, False])
def test_min_max_skipna(self, skipna):
# GH 25303
cat = Series(
Categorical(["a", "b", np.nan, "a"], categories=["b", "a"], ordered=True)
)
_min = cat.min(skipna=skipna)
_max = cat.max(skipna=skipna)

_min = cat.min()
_max = cat.max()
assert np.isnan(_min)
assert _max == "a"

_min = cat.min(numeric_only=True)
_max = cat.max(numeric_only=True)
assert _min == "b"
assert _max == "a"

_min = cat.min(numeric_only=False)
_max = cat.max(numeric_only=False)
assert np.isnan(_min)
assert _max == "a"
if skipna is True:
assert _min == "b"
assert _max == "a"
else:
assert np.isnan(_min)
assert np.isnan(_max)


class TestSeriesMode:
Expand Down