Skip to content

Commit

Permalink
Backport PR #48539 on branch 1.5.x (REGR: groupby doesn't identify nu…
Browse files Browse the repository at this point in the history
…ll values when sort=False) (#48568)

Backport PR #48539: REGR: groupby doesn't identify null values when sort=False

Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
  • Loading branch information
meeseeksmachine and rhshadrach authored Sep 15, 2022
1 parent 6ee47f9 commit 19a3f5a
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 8 deletions.
1 change: 0 additions & 1 deletion doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1016,7 +1016,6 @@ Numeric
- Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`)
- Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`)
- Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. ``bottleneck`` has been disabled for :meth:`mean` improving the loss to log-linear but may result in a performance decrease. (:issue:`42878`)
- Bug in :func:`factorize` would convert the value ``None`` to ``np.nan`` (:issue:`46601`)

Conversion
^^^^^^^^^^
Expand Down
11 changes: 11 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,17 @@ def factorize_array(

hash_klass, values = _get_hashtable_algo(values)

# factorize can now handle differentiating various types of null values.
# However, for backwards compatibility we only use the null for the
# provided dtype. This may be revisited in the future, see GH#48476.
null_mask = isna(values)
if null_mask.any():
na_value = na_value_for_dtype(values.dtype, compat=False)
# Don't modify (potentially user-provided) array
# error: No overload variant of "where" matches argument types "Any", "object",
# "ndarray[Any, Any]"
values = np.where(null_mask, na_value, values) # type: ignore[call-overload]

table = hash_klass(size_hint or len(values))
uniques, codes = table.factorize(
values,
Expand Down
23 changes: 22 additions & 1 deletion pandas/tests/groupby/test_groupby_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

from pandas.compat.pyarrow import pa_version_under1p01

from pandas.core.dtypes.missing import na_value_for_dtype

import pandas as pd
import pandas._testing as tm

Expand Down Expand Up @@ -422,7 +424,7 @@ def test_groupby_drop_nan_with_multi_index():
(
[
pd.Period("2012-02-01", freq="D"),
pd.NA,
pd.NaT,
pd.Period("2012-01-01", freq="D"),
pd.Period("2012-02-01", freq="D"),
],
Expand Down Expand Up @@ -454,3 +456,22 @@ def test_no_sort_keep_na(values, dtype, test_series):
# TODO: Slicing reorders categories?
expected.index = expected.index.reorder_categories(["y", "x"])
tm.assert_equal(result, expected)


@pytest.mark.parametrize("test_series", [True, False])
@pytest.mark.parametrize("dtype", [object, None])
def test_null_is_null_for_dtype(
sort, dtype, nulls_fixture, nulls_fixture2, test_series
):
# GH#48506 - groups should always result in using the null for the dtype
df = pd.DataFrame({"a": [1, 2]})
groups = pd.Series([nulls_fixture, nulls_fixture2], dtype=dtype)
obj = df["a"] if test_series else df
gb = obj.groupby(groups, dropna=False, sort=sort)
result = gb.sum()
index = pd.Index([na_value_for_dtype(groups.dtype)])
expected = pd.DataFrame({"a": [3]}, index=index)
if test_series:
tm.assert_series_equal(result, expected["a"])
else:
tm.assert_frame_equal(result, expected)
12 changes: 6 additions & 6 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,7 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):
(
["a", None, "b", "a"],
np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
np.array(["a", None, "b"], dtype=object),
np.array(["a", np.nan, "b"], dtype=object),
),
(
["a", np.nan, "b", "a"],
Expand All @@ -482,16 +482,16 @@ def test_object_factorize_use_na_sentinel_false(
):
codes, uniques = algos.factorize(data, use_na_sentinel=False)

tm.assert_numpy_array_equal(uniques, expected_uniques)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)

@pytest.mark.parametrize(
"data, expected_codes, expected_uniques",
[
(
[1, None, 1, 2],
np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
np.array([1, None, 2], dtype="O"),
np.array([1, np.nan, 2], dtype="O"),
),
(
[1, np.nan, 1, 2],
Expand All @@ -505,8 +505,8 @@ def test_int_factorize_use_na_sentinel_false(
):
codes, uniques = algos.factorize(data, use_na_sentinel=False)

tm.assert_numpy_array_equal(uniques, expected_uniques)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)


class TestUnique:
Expand Down

0 comments on commit 19a3f5a

Please sign in to comment.