From d765dc33c2326778df30bee1c0046db75a837363 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Thu, 13 Dec 2018 13:11:19 -0800 Subject: [PATCH 1/5] BUG-24241 make Categorical.map transform nans --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/arrays/categorical.py | 21 +++++++++++++++--- pandas/tests/indexes/test_category.py | 31 +++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 6095865fde87c..11b61b34dc00d 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1273,6 +1273,7 @@ Categorical - Bug when resampling :meth:`Dataframe.resample()` and aggregating on categorical data, the categorical dtype was getting lost. (:issue:`23227`) - Bug in many methods of the ``.str``-accessor, which always failed on calling the ``CategoricalIndex.str`` constructor (:issue:`23555`, :issue:`23556`) - Bug in :meth:`Series.where` losing the categorical dtype for categorical data (:issue:`24077`) +- Bug in :meth:`Categorical.apply` where the given function would not be applied to ``NaN`` values (:issue:`24241`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6ccb8dc5d2725..59ab02d62b44a 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1229,11 +1229,26 @@ def map(self, mapper): Index(['first', 'second', nan], dtype='object') """ new_categories = self.categories.map(mapper) + try: - return self.from_codes(self._codes.copy(), - categories=new_categories, - ordered=self.ordered) + if isinstance(mapper, (dict, ABCSeries)): + new_value = mapper[np.nan] + else: + new_value = mapper(np.nan) + except (AttributeError, KeyError, TypeError, ValueError): + new_value = np.nan + + try: + ret = self.from_codes(self._codes.copy(), + categories=new_categories, + ordered=self.ordered) + if new_value not in ret.categories and any(self._codes == 
-1): + ret.add_categories(new_value, inplace=True) + ret = ret.fillna(new_value) + return ret except ValueError: + new_categories = new_categories.insert(len(new_categories), + new_value) return np.take(new_categories, self._codes) __eq__ = _cat_compare_op('__eq__') diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index bb537f30821e4..477c3de82f004 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -311,6 +311,37 @@ def test_map_with_categorical_series(self): exp = pd.Index(["odd", "even", "odd", np.nan]) tm.assert_index_equal(a.map(c), exp) + @pytest.mark.parametrize('data, f', [[[1, 1, np.nan], pd.isna], + [[1, 2, np.nan], pd.isna], + [[1, 1, np.nan], {1: False, + np.nan: True}], + [[1, 2, np.nan], {1: False, + 2: False, + np.nan: True}]]) + def test_map_fill_nan(self, data, f): + values = pd.Categorical(data) + result = values.map(f) + if data[1] == 1: + expected = pd.Categorical([False, False, True]) + tm.assert_categorical_equal(result, expected) + else: + expected = pd.Index([False, False, True]) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize('data, f', [[[1, 1, np.nan], {1: False}], + [[1, 2, np.nan], {1: False, + 2: False}], + [[1, 1, np.nan], pd.Series([False, + False])], + [[1, 2, np.nan], pd.Series([False, + False, + False])]]) + def test_map_dont_fill_nan(self, data, f): + values = pd.Categorical(data) + result = values.map(f) + expected = pd.Index([False, False, np.nan]) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series]) def test_where(self, klass): i = self.create_index() From 628bfac22b466c4f15a26373e099eb2b05598ad8 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Fri, 14 Dec 2018 18:43:34 -0800 Subject: [PATCH 2/5] BUG-24241 make requested changes --- pandas/core/arrays/categorical.py | 2 +- pandas/tests/indexes/test_category.py | 41 ++++++++++++++++----------- 2 files 
changed, 25 insertions(+), 18 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 59ab02d62b44a..e13a45cbad231 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1231,7 +1231,7 @@ def map(self, mapper): new_categories = self.categories.map(mapper) try: - if isinstance(mapper, (dict, ABCSeries)): + if is_dict_like(mapper): new_value = mapper[np.nan] else: new_value = mapper(np.nan) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 477c3de82f004..f7c82af475b79 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -311,14 +311,18 @@ def test_map_with_categorical_series(self): exp = pd.Index(["odd", "even", "odd", np.nan]) tm.assert_index_equal(a.map(c), exp) - @pytest.mark.parametrize('data, f', [[[1, 1, np.nan], pd.isna], - [[1, 2, np.nan], pd.isna], - [[1, 1, np.nan], {1: False, - np.nan: True}], - [[1, 2, np.nan], {1: False, - 2: False, - np.nan: True}]]) - def test_map_fill_nan(self, data, f): + @pytest.mark.parametrize( + ( + 'data', + 'f' + ), + ( + ([1, 1, np.nan], pd.isna), + ([1, 2, np.nan], pd.isna), + ([1, 1, np.nan], {1: False, np.nan: True}), + ([1, 2, np.nan], {1: False, 2: False, np.nan: True}) + )) + def test_map_fill_nan(self, data, f): # GH 24241 values = pd.Categorical(data) result = values.map(f) if data[1] == 1: @@ -328,15 +332,18 @@ def test_map_fill_nan(self, data, f): expected = pd.Index([False, False, True]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('data, f', [[[1, 1, np.nan], {1: False}], - [[1, 2, np.nan], {1: False, - 2: False}], - [[1, 1, np.nan], pd.Series([False, - False])], - [[1, 2, np.nan], pd.Series([False, - False, - False])]]) - def test_map_dont_fill_nan(self, data, f): + @pytest.mark.parametrize( + ( + 'data', + 'f' + ), + ( + ([1, 1, np.nan], {1: False}), + ([1, 2, np.nan], {1: False, 2: False}), + ([1, 1, np.nan], 
pd.Series([False, False])), + ([1, 2, np.nan], pd.Series([False, False, False])) + )) + def test_map_dont_fill_nan(self, data, f): # GH 24241 values = pd.Categorical(data) result = values.map(f) expected = pd.Index([False, False, np.nan]) From 20119968d880a88a1f59e11a2ad36f8db5e927cf Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Tue, 18 Dec 2018 20:16:28 -0800 Subject: [PATCH 3/5] BUG-24241 update documentation instead --- doc/source/categorical.rst | 3 ++- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/core/arrays/categorical.py | 26 +++++++----------------- pandas/tests/indexes/test_category.py | 29 +++++++-------------------- 4 files changed, 17 insertions(+), 43 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 721e032b8bb92..ff37fbbb4aa24 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -1145,7 +1145,8 @@ dtype in apply Pandas currently does not preserve the dtype in apply functions: If you apply along rows you get a `Series` of ``object`` `dtype` (same as getting a row -> getting one element will return a -basic type) and applying along columns will also convert to object. +basic type) and applying along columns will also convert to object. ``NaN`` values are unaffected. +You can use ``fillna`` to handle missing values before applying a function. .. ipython:: python diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 11b61b34dc00d..933d6a486ad07 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1273,7 +1273,7 @@ Categorical - Bug when resampling :meth:`Dataframe.resample()` and aggregating on categorical data, the categorical dtype was getting lost. 
(:issue:`23227`) - Bug in many methods of the ``.str``-accessor, which always failed on calling the ``CategoricalIndex.str`` constructor (:issue:`23555`, :issue:`23556`) - Bug in :meth:`Series.where` losing the categorical dtype for categorical data (:issue:`24077`) -- Bug in :meth:`Categorical.apply` where the given function would not be applied to ``NaN`` values (:issue:`24241`) +- Bug in :meth:`Categorical.map` where ``NaN`` values could be handled unpredictably. They now remain unchanged (:issue:`24241`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e13a45cbad231..65679753368e8 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1166,7 +1166,7 @@ def map(self, mapper): Maps the categories to new categories. If the mapping correspondence is one-to-one the result is a :class:`~pandas.Categorical` which has the same order property as the original, otherwise a :class:`~pandas.Index` - is returned. + is returned. NaN values are unaffected. If a `dict` or :class:`~pandas.Series` is used any unmapped category is mapped to `NaN`. 
Note that if this happens an :class:`~pandas.Index` @@ -1229,26 +1229,14 @@ def map(self, mapper): Index(['first', 'second', nan], dtype='object') """ new_categories = self.categories.map(mapper) - try: - if is_dict_like(mapper): - new_value = mapper[np.nan] - else: - new_value = mapper(np.nan) - except (AttributeError, KeyError, TypeError, ValueError): - new_value = np.nan - - try: - ret = self.from_codes(self._codes.copy(), - categories=new_categories, - ordered=self.ordered) - if new_value not in ret.categories and any(self._codes == -1): - ret.add_categories(new_value, inplace=True) - ret = ret.fillna(new_value) - return ret + return self.from_codes(self._codes.copy(), + categories=new_categories, + ordered=self.ordered) except ValueError: - new_categories = new_categories.insert(len(new_categories), - new_value) + if any(self._codes == -1): + new_categories = new_categories.insert(len(new_categories), + np.nan) return np.take(new_categories, self._codes) __eq__ = _cat_compare_op('__eq__') diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index f7c82af475b79..d9dfeadd10b84 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -319,35 +319,20 @@ def test_map_with_categorical_series(self): ( ([1, 1, np.nan], pd.isna), ([1, 2, np.nan], pd.isna), - ([1, 1, np.nan], {1: False, np.nan: True}), - ([1, 2, np.nan], {1: False, 2: False, np.nan: True}) - )) - def test_map_fill_nan(self, data, f): # GH 24241 - values = pd.Categorical(data) - result = values.map(f) - if data[1] == 1: - expected = pd.Categorical([False, False, True]) - tm.assert_categorical_equal(result, expected) - else: - expected = pd.Index([False, False, True]) - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize( - ( - 'data', - 'f' - ), - ( ([1, 1, np.nan], {1: False}), ([1, 2, np.nan], {1: False, 2: False}), ([1, 1, np.nan], pd.Series([False, False])), ([1, 2, np.nan], pd.Series([False, False, 
False])) )) - def test_map_dont_fill_nan(self, data, f): # GH 24241 + def test_map_with_nan(self, data, f): # GH 24241 values = pd.Categorical(data) result = values.map(f) - expected = pd.Index([False, False, np.nan]) - tm.assert_index_equal(result, expected) + if data[1] == 1: + expected = pd.Categorical([False, False, np.nan]) + tm.assert_categorical_equal(result, expected) + else: + expected = pd.Index([False, False, np.nan]) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series]) def test_where(self, klass): From e5b5415a6a6bbe6f415ebfc8037a1bb56af6abaf Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Wed, 19 Dec 2018 15:36:33 -0800 Subject: [PATCH 4/5] BUG-24241 add comment --- pandas/core/arrays/categorical.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 65679753368e8..163db1ca43415 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1234,6 +1234,8 @@ def map(self, mapper): categories=new_categories, ordered=self.ordered) except ValueError: + # NA values are represented in self._codes with -1 + # np.take causes NA values to take final element in new_categories if any(self._codes == -1): new_categories = new_categories.insert(len(new_categories), np.nan) From 82859d9168b6943b8405a0c830c16e8c3967b713 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Wed, 19 Dec 2018 23:59:11 -0800 Subject: [PATCH 5/5] BUG-24241 use np.any instead of any --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 163db1ca43415..9a8b345cea1b3 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1236,7 +1236,7 @@ def map(self, mapper): except ValueError: # NA values are represented in self._codes with -1 # np.take causes NA values to take final 
element in new_categories - if any(self._codes == -1): + if np.any(self._codes == -1): new_categories = new_categories.insert(len(new_categories), np.nan) return np.take(new_categories, self._codes)