[SPARK-45164][PS] Remove deprecated Index APIs #42926

Closed · wants to merge 5 commits · Changes from 2 commits
2 changes: 2 additions & 0 deletions python/docs/source/migration_guide/pyspark_upgrade.rst
@@ -41,6 +41,8 @@ Upgrading from PySpark 3.5 to 4.0
* In Spark 4.0, ``squeeze`` parameter from ``ps.read_csv`` and ``ps.read_excel`` has been removed from pandas API on Spark.
* In Spark 4.0, ``null_counts`` parameter from ``DataFrame.info`` has been removed from pandas API on Spark, use ``show_counts`` instead.
* In Spark 4.0, the result of ``MultiIndex.append`` does not keep the index names from pandas API on Spark.
* In Spark 4.0, ``Index.asi8`` has been removed from pandas API on Spark, use ``Index.astype`` instead.
* In Spark 4.0, ``Index.is_type_compatible`` has been removed from pandas API on Spark, use ``Index.isin`` instead.


Upgrading from PySpark 3.3 to 3.4
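A minimal migration sketch for the two removals documented above, assuming pandas API on Spark 4.0: `Index.astype(...).to_numpy()` reproduces the integer view that `asi8` returned, and since `Index.isin` tests membership rather than dtype, comparing `Index.inferred_type` is arguably the closer drop-in for `is_type_compatible`:

```python
import pyspark.pandas as ps

idx = ps.Index([1, 2, 3])

# Before (removed in 4.0): idx.asi8
# After: cast explicitly, then materialize on the driver.
int_view = idx.astype("int64").to_numpy()  # array([1, 2, 3])

# Before (removed in 4.0): idx.is_type_compatible("integer")
# After: the deprecation message points to Index.isin, which checks
# membership, not dtype kind; comparing inferred_type is the closer
# equivalent of the removed check.
is_integer = idx.inferred_type == "integer"  # True
```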
63 changes: 0 additions & 63 deletions python/pyspark/pandas/indexes/base.py
@@ -624,44 +624,6 @@ def values(self) -> np.ndarray:
warnings.warn("We recommend using `{}.to_numpy()` instead.".format(type(self).__name__))
return self.to_numpy()

@property
def asi8(self) -> np.ndarray:
"""
Integer representation of the values.

.. warning:: We recommend using `Index.to_numpy()` instead.

.. note:: This method should only be used if the resulting NumPy ndarray is expected
to be small, as all the data is loaded into the driver's memory.

.. deprecated:: 3.4.0

Returns
-------
numpy.ndarray
An ndarray with int64 dtype.

Examples
--------
>>> ps.Index([1, 2, 3]).asi8
array([1, 2, 3])

Returns None for non-int64 dtype

>>> ps.Index(['a', 'b', 'c']).asi8 is None
True
"""
warnings.warn(
"Index.asi8 is deprecated and will be removed in 4.0.0. " "Use Index.astype instead.",
FutureWarning,
)
if isinstance(self.spark.data_type, IntegralType):
return self.to_numpy()
elif isinstance(self.spark.data_type, (TimestampType, TimestampNTZType)):
return np.array(list(map(lambda x: x.astype(np.int64), self.to_numpy())))
else:
return None

@property
def has_duplicates(self) -> bool:
"""
@@ -1118,31 +1080,6 @@ def is_object(self) -> bool:
"""
return is_object_dtype(self.dtype)

def is_type_compatible(self, kind: str) -> bool:
"""
Whether the index type is compatible with the provided type.

.. deprecated:: 3.4.0

Examples
--------
>>> psidx = ps.Index([1, 2, 3])
>>> psidx.is_type_compatible('integer')
True

>>> psidx = ps.Index([1.0, 2.0, 3.0])
>>> psidx.is_type_compatible('integer')
False
>>> psidx.is_type_compatible('floating')
True
"""
warnings.warn(
"Index.is_type_compatible is deprecated and will be removed in 4.0.0. "
"Use Index.isin instead.",
FutureWarning,
)
return kind == self.inferred_type

def dropna(self, how: str = "any") -> "Index":
"""
Return Index or MultiIndex without NA/NaN values
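For code that relied on the timestamp branch of the removed property, a user-side stand-in built only from surviving public APIs might look like the sketch below; the helper name `index_asi8` is hypothetical, and the `inferred_type` strings mirror pandas conventions rather than any documented pandas-on-Spark contract:

```python
from typing import Optional

import numpy as np
import pyspark.pandas as ps


def index_asi8(idx: ps.Index) -> Optional[np.ndarray]:
    """Hypothetical replacement for the removed Index.asi8."""
    kind = idx.inferred_type
    if kind == "integer":
        # Integral indexes: the same int64 view asi8 used to return.
        return idx.to_numpy()
    if kind in ("datetime64", "datetime"):
        # Datetime indexes: per-element cast to epoch nanoseconds,
        # mirroring the removed implementation.
        return np.array([x.astype(np.int64) for x in idx.to_numpy()])
    # All other dtypes returned None, per the removed docstring.
    return None
```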
8 changes: 0 additions & 8 deletions python/pyspark/pandas/indexes/multi.py
@@ -1267,14 +1267,6 @@ def inferred_type(self) -> str:
# Always returns "mixed" for MultiIndex
return "mixed"

@property
def asi8(self) -> None:
"""
Integer representation of the values.
"""
# Always returns None for MultiIndex
return None

def factorize(
self, sort: bool = True, na_sentinel: Optional[int] = -1
) -> Tuple["MultiIndex", pd.Index]:
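A quick illustration of why the MultiIndex override could be dropped outright: `inferred_type` is hard-coded to "mixed" for MultiIndex, so there was never an integer view to expose:

```python
import pyspark.pandas as ps

psmidx = ps.MultiIndex.from_tuples([(1, 2), (3, 4)])
print(psmidx.inferred_type)  # 'mixed', always, for MultiIndex
# The removed MultiIndex.asi8 unconditionally returned None here.
```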
67 changes: 0 additions & 67 deletions python/pyspark/pandas/tests/indexes/test_base.py
@@ -1653,73 +1653,6 @@ def test_is_type_compatible(self):
pmidx.is_type_compatible(data_type), psmidx.is_type_compatible(data_type)
)

def test_asi8(self):
# Integer
pidx = pd.Index([1, 2, 3])
psidx = ps.from_pandas(pidx)
# asi8 is removed from pandas 2.0.0.
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
self.assert_eq(np.array(pidx), psidx.asi8)
self.assert_eq(np.array(pidx.astype("int")), psidx.astype("int").asi8)
self.assert_eq(np.array(pidx.astype("int16")), psidx.astype("int16").asi8)
self.assert_eq(np.array(pidx.astype("int8")), psidx.astype("int8").asi8)
else:
self.assert_eq(pidx.asi8, psidx.asi8)
self.assert_eq(pidx.astype("int").asi8, psidx.astype("int").asi8)
self.assert_eq(pidx.astype("int16").asi8, psidx.astype("int16").asi8)
self.assert_eq(pidx.astype("int8").asi8, psidx.astype("int8").asi8)

# Integer with missing value
pidx = pd.Index([1, 2, None, 4, 5])
psidx = ps.from_pandas(pidx)
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
self.assert_eq(None, psidx.asi8)
else:
self.assert_eq(pidx.asi8, psidx.asi8)

# Datetime
pidx = pd.date_range(end="1/1/2018", periods=3)
psidx = ps.from_pandas(pidx)
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
self.assert_eq(
np.array([1514592000000000000, 1514678400000000000, 1514764800000000000]),
psidx.asi8,
)
else:
self.assert_eq(pidx.asi8, psidx.asi8)

# Floating
pidx = pd.Index([1.0, 2.0, 3.0])
psidx = ps.from_pandas(pidx)
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
self.assert_eq(None, psidx.asi8)
else:
self.assert_eq(pidx.asi8, psidx.asi8)

# String
pidx = pd.Index(["a", "b", "c"])
psidx = ps.from_pandas(pidx)
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
self.assert_eq(None, psidx.asi8)
else:
self.assert_eq(pidx.asi8, psidx.asi8)

# Boolean
pidx = pd.Index([True, False, True, False])
psidx = ps.from_pandas(pidx)
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
self.assert_eq(None, psidx.asi8)
else:
self.assert_eq(pidx.asi8, psidx.asi8)

# MultiIndex
pmidx = pd.MultiIndex.from_tuples([(1, 2)])
psmidx = ps.from_pandas(pmidx)
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
self.assert_eq(None, psmidx.asi8)
else:
self.assert_eq(pmidx.asi8, psmidx.asi8)

def test_index_is_unique(self):
indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)]
names = [None, "ks", "ks", None]