From fce1586ae4657269946cce2caf51dd45fbe776d0 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
Date: Thu, 15 Sep 2022 12:49:16 -0400
Subject: [PATCH] Backport PR #48539: REGR: groupby doesn't identify null
 values when sort=False

---
 doc/source/whatsnew/v1.5.0.rst              |  1 -
 pandas/core/algorithms.py                   | 11 ++++++++++
 pandas/tests/groupby/test_groupby_dropna.py | 23 ++++++++++++++++++++-
 pandas/tests/test_algos.py                  | 12 +++++------
 4 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 581cc9b5f61a9..bed545fccba1c 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -1016,7 +1016,6 @@ Numeric
 - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`)
 - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`)
 - Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. ``bottleneck`` has been disabled for :meth:`mean` improving the loss to log-linear but may result in a performance decrease. (:issue:`42878`)
-- Bug in :func:`factorize` would convert the value ``None`` to ``np.nan`` (:issue:`46601`)
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 2e6737b2e61aa..9b16032a1d418 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -566,6 +566,17 @@ def factorize_array(
 
     hash_klass, values = _get_hashtable_algo(values)
 
+    # factorize can now handle differentiating various types of null values.
+    # However, for backwards compatibility we only use the null for the
+    # provided dtype. This may be revisited in the future, see GH#48476.
+    null_mask = isna(values)
+    if null_mask.any():
+        na_value = na_value_for_dtype(values.dtype, compat=False)
+        # Don't modify (potentially user-provided) array
+        # error: No overload variant of "where" matches argument types "Any", "object",
+        # "ndarray[Any, Any]"
+        values = np.where(null_mask, na_value, values)  # type: ignore[call-overload]
+
     table = hash_klass(size_hint or len(values))
     uniques, codes = table.factorize(
         values,
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 394b5adcf0370..b2426ffa9dad3 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -3,6 +3,8 @@
 
 from pandas.compat.pyarrow import pa_version_under1p01
 
+from pandas.core.dtypes.missing import na_value_for_dtype
+
 import pandas as pd
 import pandas._testing as tm
 
@@ -422,7 +424,7 @@ def test_groupby_drop_nan_with_multi_index():
         (
             [
                 pd.Period("2012-02-01", freq="D"),
-                pd.NA,
+                pd.NaT,
                 pd.Period("2012-01-01", freq="D"),
                 pd.Period("2012-02-01", freq="D"),
             ],
@@ -454,3 +456,22 @@ def test_no_sort_keep_na(values, dtype, test_series):
         # TODO: Slicing reorders categories?
         expected.index = expected.index.reorder_categories(["y", "x"])
     tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize("test_series", [True, False])
+@pytest.mark.parametrize("dtype", [object, None])
+def test_null_is_null_for_dtype(
+    sort, dtype, nulls_fixture, nulls_fixture2, test_series
+):
+    # GH#48506 - groups should always result in using the null for the dtype
+    df = pd.DataFrame({"a": [1, 2]})
+    groups = pd.Series([nulls_fixture, nulls_fixture2], dtype=dtype)
+    obj = df["a"] if test_series else df
+    gb = obj.groupby(groups, dropna=False, sort=sort)
+    result = gb.sum()
+    index = pd.Index([na_value_for_dtype(groups.dtype)])
+    expected = pd.DataFrame({"a": [3]}, index=index)
+    if test_series:
+        tm.assert_series_equal(result, expected["a"])
+    else:
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 2bdd9dc8fb7b4..80271c13cd35d 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -468,7 +468,7 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):
             (
                 ["a", None, "b", "a"],
                 np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
-                np.array(["a", None, "b"], dtype=object),
+                np.array(["a", np.nan, "b"], dtype=object),
             ),
             (
                 ["a", np.nan, "b", "a"],
@@ -482,8 +482,8 @@ def test_object_factorize_use_na_sentinel_false(
     ):
         codes, uniques = algos.factorize(data, use_na_sentinel=False)
 
-        tm.assert_numpy_array_equal(uniques, expected_uniques)
-        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
+        tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)
 
     @pytest.mark.parametrize(
         "data, expected_codes, expected_uniques",
@@ -491,7 +491,7 @@ def test_object_factorize_use_na_sentinel_false(
             (
                 [1, None, 1, 2],
                 np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
-                np.array([1, None, 2], dtype="O"),
+                np.array([1, np.nan, 2], dtype="O"),
             ),
             (
                 [1, np.nan, 1, 2],
@@ -505,8 +505,8 @@ def test_int_factorize_use_na_sentinel_false(
     ):
        codes, uniques = algos.factorize(data, use_na_sentinel=False)
 
-        tm.assert_numpy_array_equal(uniques, expected_uniques)
-        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
+        tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)
 
 
 class TestUnique:
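
Note (not part of the patch): a minimal sketch of the behavior this backport restores, assuming pandas 1.5.x with the change applied. The arrays, Series, and the values in the "expected" comments below are illustrative expectations, not captured output; the only APIs used are pd.factorize with use_na_sentinel and Series.groupby with dropna=False, sort=False.

# Illustrative sketch only - assumes pandas 1.5.x with this backport applied.
import numpy as np
import pandas as pd

# With the fix, factorize() maps every null in an object array to the
# canonical null for the dtype (np.nan), so None and np.nan collapse
# into one code instead of forming two distinct groups.
codes, uniques = pd.factorize(
    np.array(["a", None, np.nan, "a"], dtype=object), use_na_sentinel=False
)
print(codes)    # expected: [0 1 1 0]
print(uniques)  # expected: ['a' nan]

# The same coercion is what lets groupby identify null keys when sort=False:
# all three rows below should fall into a single NaN group with sum 6.
values = pd.Series([1, 2, 3])
keys = pd.Series([None, np.nan, None], dtype=object)
print(values.groupby(keys, dropna=False, sort=False).sum())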