From 9d7b0c9ef4064c7554a9fa5ad0bf3bededf8c293 Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Thu, 14 Sep 2023 18:32:49 +0900 Subject: [PATCH 1/3] [SPARK-45164][PS] Remove deprecated Index APIs --- python/pyspark/pandas/indexes/base.py | 63 ----------------- python/pyspark/pandas/indexes/multi.py | 8 --- .../pyspark/pandas/tests/indexes/test_base.py | 67 ------------------- 3 files changed, 138 deletions(-) diff --git a/python/pyspark/pandas/indexes/base.py b/python/pyspark/pandas/indexes/base.py index c7d9f4e57467f..28e98bc4cf204 100644 --- a/python/pyspark/pandas/indexes/base.py +++ b/python/pyspark/pandas/indexes/base.py @@ -624,44 +624,6 @@ def values(self) -> np.ndarray: warnings.warn("We recommend using `{}.to_numpy()` instead.".format(type(self).__name__)) return self.to_numpy() - @property - def asi8(self) -> np.ndarray: - """ - Integer representation of the values. - - .. warning:: We recommend using `Index.to_numpy()` instead. - - .. note:: This method should only be used if the resulting NumPy ndarray is expected - to be small, as all the data is loaded into the driver's memory. - - .. deprecated:: 3.4.0 - - Returns - ------- - numpy.ndarray - An ndarray with int64 dtype. - - Examples - -------- - >>> ps.Index([1, 2, 3]).asi8 - array([1, 2, 3]) - - Returns None for non-int64 dtype - - >>> ps.Index(['a', 'b', 'c']).asi8 is None - True - """ - warnings.warn( - "Index.asi8 is deprecated and will be removed in 4.0.0. " "Use Index.astype instead.", - FutureWarning, - ) - if isinstance(self.spark.data_type, IntegralType): - return self.to_numpy() - elif isinstance(self.spark.data_type, (TimestampType, TimestampNTZType)): - return np.array(list(map(lambda x: x.astype(np.int64), self.to_numpy()))) - else: - return None - @property def has_duplicates(self) -> bool: """ @@ -1118,31 +1080,6 @@ def is_object(self) -> bool: """ return is_object_dtype(self.dtype) - def is_type_compatible(self, kind: str) -> bool: - """ - Whether the index type is compatible with the provided type. - - .. deprecated:: 3.4.0 - - Examples - -------- - >>> psidx = ps.Index([1, 2, 3]) - >>> psidx.is_type_compatible('integer') - True - - >>> psidx = ps.Index([1.0, 2.0, 3.0]) - >>> psidx.is_type_compatible('integer') - False - >>> psidx.is_type_compatible('floating') - True - """ - warnings.warn( - "Index.is_type_compatible is deprecated and will be removed in 4.0.0. " - "Use Index.isin instead.", - FutureWarning, - ) - return kind == self.inferred_type - def dropna(self, how: str = "any") -> "Index": """ Return Index or MultiIndex without NA/NaN values diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py index dd93e31d0235e..043d6762fb7d6 100644 --- a/python/pyspark/pandas/indexes/multi.py +++ b/python/pyspark/pandas/indexes/multi.py @@ -1267,14 +1267,6 @@ def inferred_type(self) -> str: # Always returns "mixed" for MultiIndex return "mixed" - @property - def asi8(self) -> None: - """ - Integer representation of the values. - """ - # Always returns None for MultiIndex - return None - def factorize( self, sort: bool = True, na_sentinel: Optional[int] = -1 ) -> Tuple["MultiIndex", pd.Index]: diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py index bdec1d602bf17..4cc337f602403 100644 --- a/python/pyspark/pandas/tests/indexes/test_base.py +++ b/python/pyspark/pandas/tests/indexes/test_base.py @@ -1653,73 +1653,6 @@ def test_is_type_compatible(self): pmidx.is_type_compatible(data_type), psmidx.is_type_compatible(data_type) ) - def test_asi8(self): - # Integer - pidx = pd.Index([1, 2, 3]) - psidx = ps.from_pandas(pidx) - # asi8 is removed from pandas 2.0.0. - if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - self.assert_eq(np.array(pidx), psidx.asi8) - self.assert_eq(np.array(pidx.astype("int")), psidx.astype("int").asi8) - self.assert_eq(np.array(pidx.astype("int16")), psidx.astype("int16").asi8) - self.assert_eq(np.array(pidx.astype("int8")), psidx.astype("int8").asi8) - else: - self.assert_eq(pidx.asi8, psidx.asi8) - self.assert_eq(pidx.astype("int").asi8, psidx.astype("int").asi8) - self.assert_eq(pidx.astype("int16").asi8, psidx.astype("int16").asi8) - self.assert_eq(pidx.astype("int8").asi8, psidx.astype("int8").asi8) - - # Integer with missing value - pidx = pd.Index([1, 2, None, 4, 5]) - psidx = ps.from_pandas(pidx) - if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - self.assert_eq(None, psidx.asi8) - else: - self.assert_eq(pidx.asi8, psidx.asi8) - - # Datetime - pidx = pd.date_range(end="1/1/2018", periods=3) - psidx = ps.from_pandas(pidx) - if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - self.assert_eq( - np.array([1514592000000000000, 1514678400000000000, 1514764800000000000]), - psidx.asi8, - ) - else: - self.assert_eq(pidx.asi8, psidx.asi8) - - # Floating - pidx = pd.Index([1.0, 2.0, 3.0]) - psidx = ps.from_pandas(pidx) - if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - self.assert_eq(None, psidx.asi8) - else: - self.assert_eq(pidx.asi8, psidx.asi8) - - # String - pidx = pd.Index(["a", "b", "c"]) - psidx = ps.from_pandas(pidx) - if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - self.assert_eq(None, psidx.asi8) - else: - self.assert_eq(pidx.asi8, psidx.asi8) - - # Boolean - pidx = pd.Index([True, False, True, False]) - psidx = ps.from_pandas(pidx) - if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - self.assert_eq(None, psidx.asi8) - else: - self.assert_eq(pidx.asi8, psidx.asi8) - - # MultiIndex - pmidx = pd.MultiIndex.from_tuples([(1, 2)]) - psmidx = ps.from_pandas(pmidx) - if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - self.assert_eq(None, psmidx.asi8) - else: - self.assert_eq(pmidx.asi8, psmidx.asi8) - def test_index_is_unique(self): indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)] names = [None, "ks", "ks", None] From c5fd74b7cdd56a93e2721586a80b58adf1dc7878 Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Thu, 14 Sep 2023 18:35:27 +0900 Subject: [PATCH 2/3] Add migration guide --- python/docs/source/migration_guide/pyspark_upgrade.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst b/python/docs/source/migration_guide/pyspark_upgrade.rst index d743384dee63f..76e5c701b7966 100644 --- a/python/docs/source/migration_guide/pyspark_upgrade.rst +++ b/python/docs/source/migration_guide/pyspark_upgrade.rst @@ -41,6 +41,8 @@ Upgrading from PySpark 3.5 to 4.0 * In Spark 4.0, ``squeeze`` parameter from ``ps.read_csv`` and ``ps.read_excel`` has been removed from pandas API on Spark. * In Spark 4.0, ``null_counts`` parameter from ``DataFrame.info`` has been removed from pandas API on Spark, use ``show_counts`` instead. * In Spark 4.0, the result of ``MultiIndex.append`` does not keep the index names from pandas API on Spark. +* In Spark 4.0, ``Index.asi8`` has been removed from pandas API on Spark, use ``Index.astype`` instead. +* In Spark 4.0, ``Index.is_type_compatible`` has been removed from pandas API on Spark, use ``Index.isin`` instead. Upgrading from PySpark 3.3 to 3.4 From b34abb33f043e7eee767c833d0960824c3e78404 Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Fri, 15 Sep 2023 10:43:43 +0900 Subject: [PATCH 3/3] Remove leftovers --- .../pyspark/pandas/tests/indexes/test_base.py | 72 ------------------- 1 file changed, 72 deletions(-) diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py index 4cc337f602403..678a9216ae2aa 100644 --- a/python/pyspark/pandas/tests/indexes/test_base.py +++ b/python/pyspark/pandas/tests/indexes/test_base.py @@ -1581,78 +1581,6 @@ def test_multiindex_from_frame(self): psdf = ps.from_pandas(pdf) self.assert_eq(ps.MultiIndex.from_frame(psdf), pd.MultiIndex.from_frame(pdf)) - def test_is_type_compatible(self): - data_types = ["integer", "floating", "string", "boolean"] - # Integer - pidx = pd.Index([1, 2, 3]) - psidx = ps.from_pandas(pidx) - # is_type_compatible is removed from pandas 2.0.0. - if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - expected_results = [True, False, False, False] - for data_type, expected_result in zip(data_types, expected_results): - self.assert_eq(psidx.is_type_compatible(data_type), expected_result) - else: - for data_type in data_types: - self.assert_eq( - pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type) - ) - - # Floating - pidx = pd.Index([1.0, 2.0, 3.0]) - psidx = ps.from_pandas(pidx) - # is_type_compatible is removed from pandas 2.0.0. - if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - expected_results = [False, True, False, False] - for data_type, expected_result in zip(data_types, expected_results): - self.assert_eq(psidx.is_type_compatible(data_type), expected_result) - else: - for data_type in data_types: - self.assert_eq( - pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type) - ) - - # String - pidx = pd.Index(["a", "b", "c"]) - psidx = ps.from_pandas(pidx) - # is_type_compatible is removed from pandas 2.0.0. - if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - expected_results = [False, False, True, False] - for data_type, expected_result in zip(data_types, expected_results): - self.assert_eq(psidx.is_type_compatible(data_type), expected_result) - else: - for data_type in data_types: - self.assert_eq( - pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type) - ) - - # Boolean - pidx = pd.Index([True, False, True, False]) - psidx = ps.from_pandas(pidx) - # is_type_compatible is removed from pandas 2.0.0. - if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - expected_results = [False, False, False, True] - for data_type, expected_result in zip(data_types, expected_results): - self.assert_eq(psidx.is_type_compatible(data_type), expected_result) - else: - for data_type in data_types: - self.assert_eq( - pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type) - ) - - # MultiIndex - pmidx = pd.MultiIndex.from_tuples([("a", "x")]) - psmidx = ps.from_pandas(pmidx) - # is_type_compatible is removed from pandas 2.0.0. - if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - expected_results = [False, False, False, False] - for data_type, expected_result in zip(data_types, expected_results): - self.assert_eq(psmidx.is_type_compatible(data_type), expected_result) - else: - for data_type in data_types: - self.assert_eq( - pmidx.is_type_compatible(data_type), psmidx.is_type_compatible(data_type) - ) - def test_index_is_unique(self): indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)] names = [None, "ks", "ks", None]