Skip to content

Commit

Permalink
[SPARK-43567][PS] Match `factorize` behavior with pandas 2
Browse files Browse the repository at this point in the history
  • Loading branch information
itholic committed Aug 1, 2023
1 parent 8053d5f commit 76dc06a
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 43 deletions.
38 changes: 13 additions & 25 deletions python/pyspark/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1614,7 +1614,7 @@ def take(self: IndexOpsLike, indices: Sequence[int]) -> IndexOpsLike:
return cast(IndexOpsLike, self._psdf.iloc[indices].index)

def factorize(
self: IndexOpsLike, sort: bool = True, na_sentinel: Optional[int] = -1
self: IndexOpsLike, sort: bool = True, use_na_sentinel: bool = True
) -> Tuple[IndexOpsLike, pd.Index]:
"""
Encode the object as an enumerated type or categorical variable.
Expand All @@ -1625,11 +1625,10 @@ def factorize(
Parameters
----------
sort : bool, default True
na_sentinel : int or None, default -1
Value to mark "not found". If None, will not drop the NaN
from the uniques of the values.
.. deprecated:: 3.4.0
use_na_sentinel : bool, default True
If True, the sentinel -1 will be used for NaN values. If False,
NaN values will be encoded as non-negative integers and will not drop the
NaN from the uniques of the values.
Returns
-------
Expand Down Expand Up @@ -1658,7 +1657,7 @@ def factorize(
>>> uniques
Index(['a', 'b', 'c'], dtype='object')
>>> codes, uniques = psser.factorize(na_sentinel=None)
>>> codes, uniques = psser.factorize(use_na_sentinel=False)
>>> codes
0 1
1 3
Expand All @@ -1669,17 +1668,6 @@ def factorize(
>>> uniques
Index(['a', 'b', 'c', None], dtype='object')
>>> codes, uniques = psser.factorize(na_sentinel=-2)
>>> codes
0 1
1 -2
2 0
3 2
4 1
dtype: int32
>>> uniques
Index(['a', 'b', 'c'], dtype='object')
For Index:
>>> psidx = ps.Index(['b', None, 'a', 'c', 'b'])
Expand All @@ -1691,8 +1679,8 @@ def factorize(
"""
from pyspark.pandas.series import first_series

assert (na_sentinel is None) or isinstance(na_sentinel, int)
assert sort is True
use_na_sentinel = -1 if use_na_sentinel else False # type: ignore[assignment]

warnings.warn(
"Argument `na_sentinel` will be removed in 4.0.0.",
Expand All @@ -1716,7 +1704,7 @@ def factorize(
scol = map_scol[self.spark.column]
codes, uniques = self._with_new_scol(
scol.alias(self._internal.data_spark_column_names[0])
).factorize(na_sentinel=na_sentinel)
).factorize(use_na_sentinel=use_na_sentinel)
return codes, uniques.astype(self.dtype)

uniq_sdf = self._internal.spark_frame.select(self.spark.column).distinct()
Expand All @@ -1743,13 +1731,13 @@ def factorize(

# Constructs `unique_to_code` mapping non-na unique to code
unique_to_code = {}
if na_sentinel is not None:
na_sentinel_code = na_sentinel
if use_na_sentinel:
na_sentinel_code = use_na_sentinel
code = 0
for unique in uniques_list:
if pd.isna(unique):
if na_sentinel is None:
na_sentinel_code = code
if not use_na_sentinel:
na_sentinel_code = code # type: ignore[assignment]
else:
unique_to_code[unique] = code
code += 1
Expand All @@ -1767,7 +1755,7 @@ def factorize(

codes = self._with_new_scol(new_scol.alias(self._internal.data_spark_column_names[0]))

if na_sentinel is not None:
if use_na_sentinel:
# Drops the NaN from the uniques of the values
uniques_list = [x for x in uniques_list if not pd.isna(x)]

Expand Down
8 changes: 2 additions & 6 deletions python/pyspark/pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,10 +210,6 @@ def test_astype(self):

self.assert_eq(pscidx.astype(str), pcidx.astype(str))

@unittest.skipIf(
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
"TODO(SPARK-43567): Enable CategoricalIndexTests.test_factorize for pandas 2.0.0.",
)
def test_factorize(self):
pidx = pd.CategoricalIndex([1, 2, 3, None])
psidx = ps.from_pandas(pidx)
Expand All @@ -224,8 +220,8 @@ def test_factorize(self):
self.assert_eq(kcodes.tolist(), pcodes.tolist())
self.assert_eq(kuniques, puniques)

pcodes, puniques = pidx.factorize(na_sentinel=-2)
kcodes, kuniques = psidx.factorize(na_sentinel=-2)
pcodes, puniques = pidx.factorize(use_na_sentinel=-2)
kcodes, kuniques = psidx.factorize(use_na_sentinel=-2)

self.assert_eq(kcodes.tolist(), pcodes.tolist())
self.assert_eq(kuniques, puniques)
Expand Down
20 changes: 8 additions & 12 deletions python/pyspark/pandas/tests/series/test_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,10 +415,6 @@ def test_abs(self):
self.assert_eq(abs(psser), abs(pser))
self.assert_eq(np.abs(psser), np.abs(pser))

@unittest.skipIf(
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
"TODO(SPARK-43550): Enable SeriesTests.test_factorize for pandas 2.0.0.",
)
def test_factorize(self):
pser = pd.Series(["a", "b", "a", "b"])
psser = ps.from_pandas(pser)
Expand Down Expand Up @@ -500,27 +496,27 @@ def test_factorize(self):
pser = pd.Series(["a", "b", "a", np.nan, None])
psser = ps.from_pandas(pser)

pcodes, puniques = pser.factorize(sort=True, na_sentinel=-2)
kcodes, kuniques = psser.factorize(na_sentinel=-2)
pcodes, puniques = pser.factorize(sort=True, use_na_sentinel=-2)
kcodes, kuniques = psser.factorize(use_na_sentinel=-2)
self.assert_eq(pcodes.tolist(), kcodes.to_list())
self.assert_eq(puniques, kuniques)

pcodes, puniques = pser.factorize(sort=True, na_sentinel=2)
kcodes, kuniques = psser.factorize(na_sentinel=2)
pcodes, puniques = pser.factorize(sort=True, use_na_sentinel=2)
kcodes, kuniques = psser.factorize(use_na_sentinel=2)
self.assert_eq(pcodes.tolist(), kcodes.to_list())
self.assert_eq(puniques, kuniques)

if not pd_below_1_1_2:
pcodes, puniques = pser.factorize(sort=True, na_sentinel=None)
kcodes, kuniques = psser.factorize(na_sentinel=None)
pcodes, puniques = pser.factorize(sort=True, use_na_sentinel=None)
kcodes, kuniques = psser.factorize(use_na_sentinel=None)
self.assert_eq(pcodes.tolist(), kcodes.to_list())
# puniques is Index(['a', 'b', nan], dtype='object')
self.assert_eq(ps.Index(["a", "b", None]), kuniques)

psser = ps.Series([1, 2, np.nan, 4, 5]) # Arrow takes np.nan as null
psser.loc[3] = np.nan # Spark takes np.nan as NaN
kcodes, kuniques = psser.factorize(na_sentinel=None)
pcodes, puniques = psser._to_pandas().factorize(sort=True, na_sentinel=None)
kcodes, kuniques = psser.factorize(use_na_sentinel=None)
pcodes, puniques = psser._to_pandas().factorize(sort=True, use_na_sentinel=None)
self.assert_eq(pcodes.tolist(), kcodes.to_list())
self.assert_eq(puniques, kuniques)

Expand Down

0 comments on commit 76dc06a

Please sign in to comment.