From 9381381874eb2075c6942ceef960955eb59040fb Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Thu, 25 Jan 2024 23:12:07 +0100 Subject: [PATCH] fix(python): Use `date_as_object=False` as default for `Series.to_pandas` (just like `DataFrame.to_pandas`) (#13984) --- py-polars/polars/dataframe/frame.py | 75 +++++++++---------- py-polars/polars/series/series.py | 68 +++++++++-------- .../tests/unit/interop/test_to_pandas.py | 63 ++++++++++------ 3 files changed, 117 insertions(+), 89 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index a7c499017a12..e5e141437c06 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -2189,80 +2189,79 @@ def to_numpy( return out - def to_pandas( # noqa: D417 + def to_pandas( self, - *args: Any, + *, use_pyarrow_extension_array: bool = False, **kwargs: Any, ) -> pd.DataFrame: """ - Cast to a pandas DataFrame. + Convert this DataFrame to a pandas DataFrame. - This requires that :mod:`pandas` and :mod:`pyarrow` are installed. - This operation clones data, unless `use_pyarrow_extension_array=True`. + This operation copies data if `use_pyarrow_extension_array` is not enabled. Parameters ---------- use_pyarrow_extension_array - Use PyArrow backed-extension arrays instead of numpy arrays for each column - of the pandas DataFrame; this allows zero copy operations and preservation + Use PyArrow-backed extension arrays instead of NumPy arrays for the columns + of the pandas DataFrame. This allows zero copy operations and preservation of null values. Subsequent operations on the resulting pandas DataFrame may - trigger conversion to NumPy arrays if that operation is not supported by - pyarrow compute functions. + trigger conversion to NumPy if those operations are not supported by PyArrow + compute functions. **kwargs - Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
+ Additional keyword arguments to be passed to + :meth:`pyarrow.Table.to_pandas`. Returns ------- :class:`pandas.DataFrame` + Notes + ----- + This operation requires that both :mod:`pandas` and :mod:`pyarrow` are + installed. + Examples -------- - >>> import pandas - >>> df1 = pl.DataFrame( + >>> df = pl.DataFrame( ... { ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], + ... "bar": [6.0, 7.0, 8.0], ... "ham": ["a", "b", "c"], ... } ... ) - >>> pandas_df1 = df1.to_pandas() - >>> type(pandas_df1) - <class 'pandas.core.frame.DataFrame'> - >>> pandas_df1.dtypes - foo int64 - bar int64 - ham object - dtype: object - >>> df2 = pl.DataFrame( + >>> df.to_pandas() + foo bar ham + 0 1 6.0 a + 1 2 7.0 b + 2 3 8.0 c + + Null values in numeric columns are converted to `NaN`. + + >>> df = pl.DataFrame( ... { ... "foo": [1, 2, None], - ... "bar": [6, None, 8], + ... "bar": [6.0, None, 8.0], ... "ham": [None, "b", "c"], ... } ... ) - >>> pandas_df2 = df2.to_pandas() - >>> pandas_df2 + >>> df.to_pandas() foo bar ham 0 1.0 6.0 None 1 2.0 NaN b 2 NaN 8.0 c - >>> pandas_df2.dtypes - foo float64 - bar float64 - ham object - dtype: object - >>> pandas_df2_pa = df2.to_pandas( - ... use_pyarrow_extension_array=True - ... ) # doctest: +SKIP - >>> pandas_df2_pa # doctest: +SKIP + + Pass `use_pyarrow_extension_array=True` to get a pandas DataFrame with columns + backed by PyArrow extension arrays. This will preserve null values. 
+ >>> df.to_pandas(use_pyarrow_extension_array=True) foo bar ham - 0 1 6 <NA> + 0 1 6.0 <NA> 1 2 <NA> b - 2 <NA> 8 c - >>> pandas_df2_pa.dtypes # doctest: +SKIP + 2 <NA> 8.0 c + >>> _.dtypes foo int64[pyarrow] - bar int64[pyarrow] + bar double[pyarrow] ham large_string[pyarrow] dtype: object """ diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index aa9566d69d96..d53bc2a99a4a 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -4450,53 +4450,59 @@ def to_arrow(self) -> pa.Array: """ return self._s.to_arrow() - def to_pandas( # noqa: D417 - self, *args: Any, use_pyarrow_extension_array: bool = False, **kwargs: Any + def to_pandas( + self, *, use_pyarrow_extension_array: bool = False, **kwargs: Any ) -> pd.Series[Any]: """ Convert this Series to a pandas Series. - This requires that :mod:`pandas` and :mod:`pyarrow` are installed. - This operation clones data, unless `use_pyarrow_extension_array=True`. + This operation copies data if `use_pyarrow_extension_array` is not enabled. Parameters ---------- use_pyarrow_extension_array - Further operations on this Pandas series, might trigger conversion to numpy. - Use PyArrow backed-extension array instead of numpy array for pandas - Series. This allows zero copy operations and preservation of nulls - values. - Further operations on this pandas Series, might trigger conversion - to NumPy arrays if that operation is not supported by pyarrow compute - functions. - kwargs - Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + Use a PyArrow-backed extension array instead of a NumPy array for the pandas + Series. This allows zero copy operations and preservation of null values. + Subsequent operations on the resulting pandas Series may trigger conversion + to NumPy if those operations are not supported by PyArrow compute functions. + **kwargs + Additional keyword arguments to be passed to + :meth:`pyarrow.Array.to_pandas`. 
+ + Returns + ------- + :class:`pandas.Series` + + Notes + ----- + This operation requires that both :mod:`pandas` and :mod:`pyarrow` are + installed. Examples -------- - >>> s1 = pl.Series("a", [1, 2, 3]) - >>> s1.to_pandas() + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_pandas() 0 1 1 2 2 3 Name: a, dtype: int64 - >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP - 0 1 - 1 2 - 2 3 - Name: a, dtype: int64[pyarrow] - >>> s2 = pl.Series("b", [1, 2, None, 4]) - >>> s2.to_pandas() + + Null values are converted to `NaN`. + + >>> s = pl.Series("b", [1, 2, None]) + >>> s.to_pandas() 0 1.0 1 2.0 2 NaN - 3 4.0 Name: b, dtype: float64 - >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + + Pass `use_pyarrow_extension_array=True` to get a pandas Series backed by a + PyArrow extension array. This will preserve null values. + + >>> s.to_pandas(use_pyarrow_extension_array=True) 0 1 1 2 2 <NA> - 3 4 Name: b, dtype: int64[pyarrow] """ if use_pyarrow_extension_array: @@ -4511,16 +4517,18 @@ def to_pandas( # noqa: D417 else "" ) - pd_series = ( - self.to_arrow().to_pandas( + pa_arr = self.to_arrow() + if use_pyarrow_extension_array: + pd_series = pa_arr.to_pandas( self_destruct=True, split_blocks=True, types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype), **kwargs, ) - if use_pyarrow_extension_array - else self.to_arrow().to_pandas(**kwargs) - ) + else: + date_as_object = kwargs.pop("date_as_object", False) + pd_series = pa_arr.to_pandas(date_as_object=date_as_object, **kwargs) + pd_series.name = self.name return pd_series diff --git a/py-polars/tests/unit/interop/test_to_pandas.py b/py-polars/tests/unit/interop/test_to_pandas.py index cdc976a0e789..6944fbaa220a 100644 --- a/py-polars/tests/unit/interop/test_to_pandas.py +++ b/py-polars/tests/unit/interop/test_to_pandas.py @@ -34,16 +34,15 @@ def test_to_pandas() -> None: ) pd_out = df.to_pandas() - ns_datetimes = pa.__version__ < "13" pd_out_dtypes_expected = [ np.dtype(np.uint8), 
np.dtype(np.float64), np.dtype(np.float64), - np.dtype(f"datetime64[{'ns' if ns_datetimes else 'ms'}]"), + np.dtype("datetime64[ms]"), np.dtype(np.object_), np.dtype(np.object_), - np.dtype(f"datetime64[{'ns' if ns_datetimes else 'us'}]"), + np.dtype("datetime64[us]"), pd.CategoricalDtype(categories=["a", "b", "c"], ordered=False), pd.CategoricalDtype(categories=["e", "f"], ordered=False), ] @@ -53,24 +52,20 @@ def test_to_pandas() -> None: pd_out = df.to_pandas(date_as_object=True) assert pd_out_dtypes_expected == pd_out.dtypes.to_list() - try: - pd_pa_out = df.to_pandas(use_pyarrow_extension_array=True) - pd_pa_dtypes_names = [dtype.name for dtype in pd_pa_out.dtypes] - pd_pa_dtypes_names_expected = [ - "uint8[pyarrow]", - "int64[pyarrow]", - "double[pyarrow]", - "date32[day][pyarrow]", - "large_string[pyarrow]", - "large_string[pyarrow]", - "timestamp[us][pyarrow]", - "dictionary[pyarrow]", - "dictionary[pyarrow]", - ] - assert pd_pa_dtypes_names == pd_pa_dtypes_names_expected - except ModuleNotFoundError: - # Skip test if Pandas 1.5.x is not installed. 
- pass + pd_pa_out = df.to_pandas(use_pyarrow_extension_array=True) + pd_pa_dtypes_names = [dtype.name for dtype in pd_pa_out.dtypes] + pd_pa_dtypes_names_expected = [ + "uint8[pyarrow]", + "int64[pyarrow]", + "double[pyarrow]", + "date32[day][pyarrow]", + "large_string[pyarrow]", + "large_string[pyarrow]", + "timestamp[us][pyarrow]", + "dictionary[pyarrow]", + "dictionary[pyarrow]", + ] + assert pd_pa_dtypes_names == pd_pa_dtypes_names_expected def test_cat_to_pandas() -> None: @@ -114,3 +109,29 @@ def test_from_empty_pandas_with_dtypes() -> None: def test_to_pandas_series() -> None: assert (pl.Series("a", [1, 2, 3]).to_pandas() == pd.Series([1, 2, 3])).all() + + +def test_to_pandas_date() -> None: + data = [date(1990, 1, 1), date(2024, 12, 31)] + s = pl.Series("a", data) + + result_series = s.to_pandas() + expected_series = pd.Series(data, dtype="datetime64[ms]", name="a") + pd.testing.assert_series_equal(result_series, expected_series) + + result_df = s.to_frame().to_pandas() + expected_df = expected_series.to_frame() + pd.testing.assert_frame_equal(result_df, expected_df) + + +def test_to_pandas_datetime() -> None: + data = [datetime(1990, 1, 1, 0, 0, 0), datetime(2024, 12, 31, 23, 59, 59)] + s = pl.Series("a", data) + + result_series = s.to_pandas() + expected_series = pd.Series(data, dtype="datetime64[us]", name="a") + pd.testing.assert_series_equal(result_series, expected_series) + + result_df = s.to_frame().to_pandas() + expected_df = expected_series.to_frame() + pd.testing.assert_frame_equal(result_df, expected_df)