Skip to content

Commit

Permalink
fix(python): Use date_as_object=False as default for `Series.to_pandas` (just like `DataFrame.to_pandas`) (#13984)
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego committed Jan 25, 2024
1 parent a6ee1e9 commit 9381381
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 89 deletions.
75 changes: 37 additions & 38 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2189,80 +2189,79 @@ def to_numpy(

return out

def to_pandas( # noqa: D417
def to_pandas(
self,
*args: Any,
*,
use_pyarrow_extension_array: bool = False,
**kwargs: Any,
) -> pd.DataFrame:
"""
Cast to a pandas DataFrame.
Convert this DataFrame to a pandas DataFrame.
This requires that :mod:`pandas` and :mod:`pyarrow` are installed.
This operation clones data, unless `use_pyarrow_extension_array=True`.
This operation copies data if `use_pyarrow_extension_array` is not enabled.
Parameters
----------
use_pyarrow_extension_array
Use PyArrow backed-extension arrays instead of numpy arrays for each column
of the pandas DataFrame; this allows zero copy operations and preservation
Use PyArrow-backed extension arrays instead of NumPy arrays for the columns
of the pandas DataFrame. This allows zero copy operations and preservation
of null values. Subsequent operations on the resulting pandas DataFrame may
trigger conversion to NumPy arrays if that operation is not supported by
pyarrow compute functions.
trigger conversion to NumPy if those operations are not supported by PyArrow
compute functions.
**kwargs
Arguments will be sent to :meth:`pyarrow.Table.to_pandas`.
Additional keyword arguments to be passed to
:meth:`pyarrow.Table.to_pandas`.
Returns
-------
:class:`pandas.DataFrame`
Notes
-----
This operation requires that both :mod:`pandas` and :mod:`pyarrow` are
installed.
Examples
--------
>>> import pandas
>>> df1 = pl.DataFrame(
>>> df = pl.DataFrame(
... {
... "foo": [1, 2, 3],
... "bar": [6, 7, 8],
... "bar": [6.0, 7.0, 8.0],
... "ham": ["a", "b", "c"],
... }
... )
>>> pandas_df1 = df1.to_pandas()
>>> type(pandas_df1)
<class 'pandas.core.frame.DataFrame'>
>>> pandas_df1.dtypes
foo int64
bar int64
ham object
dtype: object
>>> df2 = pl.DataFrame(
>>> df.to_pandas()
foo bar ham
0 1 6.0 a
1 2 7.0 b
2 3 8.0 c
Null values in numeric columns are converted to `NaN`.
>>> df = pl.DataFrame(
... {
... "foo": [1, 2, None],
... "bar": [6, None, 8],
... "bar": [6.0, None, 8.0],
... "ham": [None, "b", "c"],
... }
... )
>>> pandas_df2 = df2.to_pandas()
>>> pandas_df2
>>> df.to_pandas()
foo bar ham
0 1.0 6.0 None
1 2.0 NaN b
2 NaN 8.0 c
>>> pandas_df2.dtypes
foo float64
bar float64
ham object
dtype: object
>>> pandas_df2_pa = df2.to_pandas(
... use_pyarrow_extension_array=True
... ) # doctest: +SKIP
>>> pandas_df2_pa # doctest: +SKIP
Pass `use_pyarrow_extension_array=True` to get a pandas DataFrame with columns
backed by PyArrow extension arrays. This will preserve null values.
>>> df.to_pandas(use_pyarrow_extension_array=True)
foo bar ham
0 1 6 <NA>
0 1 6.0 <NA>
1 2 <NA> b
2 <NA> 8 c
>>> pandas_df2_pa.dtypes # doctest: +SKIP
2 <NA> 8.0 c
>>> _.dtypes
foo int64[pyarrow]
bar int64[pyarrow]
bar double[pyarrow]
ham large_string[pyarrow]
dtype: object
"""
Expand Down
68 changes: 38 additions & 30 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4450,53 +4450,59 @@ def to_arrow(self) -> pa.Array:
"""
return self._s.to_arrow()

def to_pandas( # noqa: D417
self, *args: Any, use_pyarrow_extension_array: bool = False, **kwargs: Any
def to_pandas(
self, *, use_pyarrow_extension_array: bool = False, **kwargs: Any
) -> pd.Series[Any]:
"""
Convert this Series to a pandas Series.
This requires that :mod:`pandas` and :mod:`pyarrow` are installed.
This operation clones data, unless `use_pyarrow_extension_array=True`.
This operation copies data if `use_pyarrow_extension_array` is not enabled.
Parameters
----------
use_pyarrow_extension_array
Further operations on this Pandas series, might trigger conversion to numpy.
Use PyArrow backed-extension array instead of numpy array for pandas
Series. This allows zero copy operations and preservation of nulls
values.
Further operations on this pandas Series, might trigger conversion
to NumPy arrays if that operation is not supported by pyarrow compute
functions.
kwargs
Arguments will be sent to :meth:`pyarrow.Table.to_pandas`.
Use a PyArrow-backed extension array instead of a NumPy array for the pandas
Series. This allows zero copy operations and preservation of null values.
Subsequent operations on the resulting pandas Series may trigger conversion
to NumPy if those operations are not supported by PyArrow compute functions.
**kwargs
Additional keyword arguments to be passed to
:meth:`pyarrow.Array.to_pandas`.
Returns
-------
:class:`pandas.Series`
Notes
-----
This operation requires that both :mod:`pandas` and :mod:`pyarrow` are
installed.
Examples
--------
>>> s1 = pl.Series("a", [1, 2, 3])
>>> s1.to_pandas()
>>> s = pl.Series("a", [1, 2, 3])
>>> s.to_pandas()
0 1
1 2
2 3
Name: a, dtype: int64
>>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP
0 1
1 2
2 3
Name: a, dtype: int64[pyarrow]
>>> s2 = pl.Series("b", [1, 2, None, 4])
>>> s2.to_pandas()
Null values are converted to `NaN`.
>>> s = pl.Series("b", [1, 2, None])
>>> s.to_pandas()
0 1.0
1 2.0
2 NaN
3 4.0
Name: b, dtype: float64
>>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP
Pass `use_pyarrow_extension_array=True` to get a pandas Series backed by a
PyArrow extension array. This will preserve null values.
>>> s.to_pandas(use_pyarrow_extension_array=True)
0 1
1 2
2 <NA>
3 4
Name: b, dtype: int64[pyarrow]
"""
if use_pyarrow_extension_array:
Expand All @@ -4511,16 +4517,18 @@ def to_pandas( # noqa: D417
else ""
)

pd_series = (
self.to_arrow().to_pandas(
pa_arr = self.to_arrow()
if use_pyarrow_extension_array:
pd_series = pa_arr.to_pandas(
self_destruct=True,
split_blocks=True,
types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype),
**kwargs,
)
if use_pyarrow_extension_array
else self.to_arrow().to_pandas(**kwargs)
)
else:
date_as_object = kwargs.pop("date_as_object", False)
pd_series = pa_arr.to_pandas(date_as_object=date_as_object, **kwargs)

pd_series.name = self.name
return pd_series

Expand Down
63 changes: 42 additions & 21 deletions py-polars/tests/unit/interop/test_to_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,15 @@ def test_to_pandas() -> None:
)

pd_out = df.to_pandas()
ns_datetimes = pa.__version__ < "13"

pd_out_dtypes_expected = [
np.dtype(np.uint8),
np.dtype(np.float64),
np.dtype(np.float64),
np.dtype(f"datetime64[{'ns' if ns_datetimes else 'ms'}]"),
np.dtype("datetime64[ms]"),
np.dtype(np.object_),
np.dtype(np.object_),
np.dtype(f"datetime64[{'ns' if ns_datetimes else 'us'}]"),
np.dtype("datetime64[us]"),
pd.CategoricalDtype(categories=["a", "b", "c"], ordered=False),
pd.CategoricalDtype(categories=["e", "f"], ordered=False),
]
Expand All @@ -53,24 +52,20 @@ def test_to_pandas() -> None:
pd_out = df.to_pandas(date_as_object=True)
assert pd_out_dtypes_expected == pd_out.dtypes.to_list()

try:
pd_pa_out = df.to_pandas(use_pyarrow_extension_array=True)
pd_pa_dtypes_names = [dtype.name for dtype in pd_pa_out.dtypes]
pd_pa_dtypes_names_expected = [
"uint8[pyarrow]",
"int64[pyarrow]",
"double[pyarrow]",
"date32[day][pyarrow]",
"large_string[pyarrow]",
"large_string[pyarrow]",
"timestamp[us][pyarrow]",
"dictionary<values=large_string, indices=int64, ordered=0>[pyarrow]",
"dictionary<values=large_string, indices=int64, ordered=0>[pyarrow]",
]
assert pd_pa_dtypes_names == pd_pa_dtypes_names_expected
except ModuleNotFoundError:
# Skip test if Pandas 1.5.x is not installed.
pass
pd_pa_out = df.to_pandas(use_pyarrow_extension_array=True)
pd_pa_dtypes_names = [dtype.name for dtype in pd_pa_out.dtypes]
pd_pa_dtypes_names_expected = [
"uint8[pyarrow]",
"int64[pyarrow]",
"double[pyarrow]",
"date32[day][pyarrow]",
"large_string[pyarrow]",
"large_string[pyarrow]",
"timestamp[us][pyarrow]",
"dictionary<values=large_string, indices=int64, ordered=0>[pyarrow]",
"dictionary<values=large_string, indices=int64, ordered=0>[pyarrow]",
]
assert pd_pa_dtypes_names == pd_pa_dtypes_names_expected


def test_cat_to_pandas() -> None:
Expand Down Expand Up @@ -114,3 +109,29 @@ def test_from_empty_pandas_with_dtypes() -> None:

def test_to_pandas_series() -> None:
    """A plain integer Series round-trips to pandas with its name and int64 dtype intact.

    The original element-wise `(... == ...).all()` check ignored the dtype and the
    series name (and would pass vacuously on an empty result);
    `assert_series_equal` checks values, dtype, and name explicitly.
    """
    result = pl.Series("a", [1, 2, 3]).to_pandas()
    expected = pd.Series([1, 2, 3], name="a")
    pd.testing.assert_series_equal(result, expected)


def test_to_pandas_date() -> None:
    """A polars Date series converts to pandas `datetime64[ms]`, both as a Series and as a one-column frame."""
    values = [date(1990, 1, 1), date(2024, 12, 31)]
    series = pl.Series("a", values)
    expected = pd.Series(values, dtype="datetime64[ms]", name="a")

    # Series-level conversion.
    pd.testing.assert_series_equal(series.to_pandas(), expected)

    # Frame-level conversion must agree with the Series-level result.
    pd.testing.assert_frame_equal(series.to_frame().to_pandas(), expected.to_frame())


def test_to_pandas_datetime() -> None:
    """A polars Datetime series converts to pandas `datetime64[us]`, both as a Series and as a one-column frame."""
    values = [datetime(1990, 1, 1, 0, 0, 0), datetime(2024, 12, 31, 23, 59, 59)]
    series = pl.Series("a", values)
    expected = pd.Series(values, dtype="datetime64[us]", name="a")

    # Series-level conversion.
    pd.testing.assert_series_equal(series.to_pandas(), expected)

    # Frame-level conversion must agree with the Series-level result.
    pd.testing.assert_frame_equal(series.to_frame().to_pandas(), expected.to_frame())

0 comments on commit 9381381

Please sign in to comment.