From fce1586ae4657269946cce2caf51dd45fbe776d0 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
Date: Thu, 15 Sep 2022 12:49:16 -0400
Subject: [PATCH] Backport PR #48539: REGR: groupby doesn't identify null
 values when sort=False

---
 doc/source/whatsnew/v1.5.0.rst              |  1 -
 pandas/core/algorithms.py                   | 11 ++++++++++
 pandas/tests/groupby/test_groupby_dropna.py | 23 ++++++++++++++++++++-
 pandas/tests/test_algos.py                  | 12 +++++------
 4 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 581cc9b5f61a9..bed545fccba1c 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -1016,7 +1016,6 @@ Numeric
 - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`)
 - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`)
 - Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. ``bottleneck`` has been disabled for :meth:`mean` improving the loss to log-linear but may result in a performance decrease. (:issue:`42878`)
-- Bug in :func:`factorize` would convert the value ``None`` to ``np.nan`` (:issue:`46601`)
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 2e6737b2e61aa..9b16032a1d418 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -566,6 +566,17 @@ def factorize_array(
 
     hash_klass, values = _get_hashtable_algo(values)
 
+    # factorize can now handle differentiating various types of null values.
+    # However, for backwards compatibility we only use the null for the
+    # provided dtype. This may be revisited in the future, see GH#48476.
+    null_mask = isna(values)
+    if null_mask.any():
+        na_value = na_value_for_dtype(values.dtype, compat=False)
+        # Don't modify (potentially user-provided) array
+        # error: No overload variant of "where" matches argument types "Any", "object",
+        # "ndarray[Any, Any]"
+        values = np.where(null_mask, na_value, values)  # type: ignore[call-overload]
+
     table = hash_klass(size_hint or len(values))
     uniques, codes = table.factorize(
         values,
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 394b5adcf0370..b2426ffa9dad3 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -3,6 +3,8 @@
 
 from pandas.compat.pyarrow import pa_version_under1p01
 
+from pandas.core.dtypes.missing import na_value_for_dtype
+
 import pandas as pd
 import pandas._testing as tm
 
@@ -422,7 +424,7 @@ def test_groupby_drop_nan_with_multi_index():
         (
             [
                 pd.Period("2012-02-01", freq="D"),
-                pd.NA,
+                pd.NaT,
                 pd.Period("2012-01-01", freq="D"),
                 pd.Period("2012-02-01", freq="D"),
             ],
@@ -454,3 +456,22 @@ def test_no_sort_keep_na(values, dtype, test_series):
         # TODO: Slicing reorders categories?
         expected.index = expected.index.reorder_categories(["y", "x"])
     tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize("test_series", [True, False])
+@pytest.mark.parametrize("dtype", [object, None])
+def test_null_is_null_for_dtype(
+    sort, dtype, nulls_fixture, nulls_fixture2, test_series
+):
+    # GH#48506 - groups should always result in using the null for the dtype
+    df = pd.DataFrame({"a": [1, 2]})
+    groups = pd.Series([nulls_fixture, nulls_fixture2], dtype=dtype)
+    obj = df["a"] if test_series else df
+    gb = obj.groupby(groups, dropna=False, sort=sort)
+    result = gb.sum()
+    index = pd.Index([na_value_for_dtype(groups.dtype)])
+    expected = pd.DataFrame({"a": [3]}, index=index)
+    if test_series:
+        tm.assert_series_equal(result, expected["a"])
+    else:
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 2bdd9dc8fb7b4..80271c13cd35d 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -468,7 +468,7 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):
             (
                 ["a", None, "b", "a"],
                 np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
-                np.array(["a", None, "b"], dtype=object),
+                np.array(["a", np.nan, "b"], dtype=object),
             ),
             (
                 ["a", np.nan, "b", "a"],
@@ -482,8 +482,8 @@ def test_object_factorize_use_na_sentinel_false(
     ):
         codes, uniques = algos.factorize(data, use_na_sentinel=False)
 
-        tm.assert_numpy_array_equal(uniques, expected_uniques)
-        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
+        tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)
 
     @pytest.mark.parametrize(
         "data, expected_codes, expected_uniques",
@@ -491,7 +491,7 @@ def test_object_factorize_use_na_sentinel_false(
             (
                 [1, None, 1, 2],
                 np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
-                np.array([1, None, 2], dtype="O"),
+                np.array([1, np.nan, 2], dtype="O"),
             ),
             (
                 [1, np.nan, 1, 2],
@@ -505,8 +505,8 @@ def test_int_factorize_use_na_sentinel_false(
     ):
        codes, uniques = algos.factorize(data, use_na_sentinel=False)
 
-        tm.assert_numpy_array_equal(uniques, expected_uniques)
-        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
+        tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)
 
 
 class TestUnique:
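
Note (not part of the patch): a minimal sketch of the behavior this backport restores, assuming pandas 1.5.x with the change applied. The arrays, Series, and the values in the "expected" comments below are illustrative expectations, not captured output; the only APIs used are pd.factorize with use_na_sentinel and Series.groupby with dropna=False, sort=False.

# Illustrative sketch only - assumes pandas 1.5.x with this backport applied.
import numpy as np
import pandas as pd

# With the fix, factorize() maps every null in an object array to the
# canonical null for the dtype (np.nan), so None and np.nan collapse
# into one code instead of forming two distinct groups.
codes, uniques = pd.factorize(
    np.array(["a", None, np.nan, "a"], dtype=object), use_na_sentinel=False
)
print(codes)    # expected: [0 1 1 0]
print(uniques)  # expected: ['a' nan]

# The same coercion is what lets groupby identify null keys when sort=False:
# all three rows below should fall into a single NaN group with sum 6.
values = pd.Series([1, 2, 3])
keys = pd.Series([None, np.nan, None], dtype=object)
print(values.groupby(keys, dropna=False, sort=False).sum())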