Skip to content

Commit

Permalink
fix(python): Make to_pandas() work for Dataframe and Series with dtype `Object` (#13910)

Browse files Browse the repository at this point in the history

Co-authored-by: Itamar Turner-Trauring <itamar@pythonspeed.com>
Co-authored-by: Stijn de Gooijer <stijndegooijer@gmail.com>
  • Loading branch information
3 people committed Jan 26, 2024
1 parent f3c4cc5 commit 79a0984
Show file tree
Hide file tree
Showing 5 changed files with 110 additions and 11 deletions.
63 changes: 59 additions & 4 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2266,9 +2266,6 @@ def to_pandas(
ham large_string[pyarrow]
dtype: object
"""
if not self.width: # 0x0 dataframe, cannot infer schema from batches
return pd.DataFrame()

if use_pyarrow_extension_array:
if parse_version(pd.__version__) < parse_version("1.5"):
msg = f'pandas>=1.5.0 is required for `to_pandas("use_pyarrow_extension_array=True")`, found Pandas {pd.__version__!r}'
Expand All @@ -2281,7 +2278,65 @@ def to_pandas(
else:
raise ModuleNotFoundError(msg)

record_batches = self._df.to_pandas()
# Object columns must be handled separately as Arrow does not convert them
# correctly
if Object in self.dtypes:
return self._to_pandas_with_object_columns(
use_pyarrow_extension_array=use_pyarrow_extension_array, **kwargs
)

return self._to_pandas_without_object_columns(
self, use_pyarrow_extension_array=use_pyarrow_extension_array, **kwargs
)

def _to_pandas_with_object_columns(
    self,
    *,
    use_pyarrow_extension_array: bool,
    **kwargs: Any,
) -> pd.DataFrame:
    """
    Convert to a pandas DataFrame when at least one column has dtype
    ``pl.Object``.

    Object columns cannot go through the Arrow path, so the frame is split:
    non-Object columns are converted in bulk, then each Object column is
    inserted individually via ``Series.to_pandas``.
    """
    # Partition column positions by whether their dtype is pl.Object.
    object_idxs = [i for i, dt in enumerate(self.dtypes) if dt == Object]
    other_idxs = [i for i, dt in enumerate(self.dtypes) if dt != Object]

    # Bulk-convert the non-Object columns (order preserved); an all-Object
    # frame starts from an empty pandas DataFrame instead.
    if other_idxs:
        pandas_df = self._to_pandas_without_object_columns(
            self[:, other_idxs],
            use_pyarrow_extension_array=use_pyarrow_extension_array,
            **kwargs,
        )
    else:
        pandas_df = pd.DataFrame()

    # Insert the Object columns in ascending position order. Processing them
    # left-to-right means every column to the left of the insertion point is
    # already present, so the original index is also the correct index in the
    # partially constructed pandas frame.
    for idx in object_idxs:
        pandas_df.insert(idx, self.columns[idx], self.to_series(idx).to_pandas())

    return pandas_df

def _to_pandas_without_object_columns(
self,
df: DataFrame,
*,
use_pyarrow_extension_array: bool,
**kwargs: Any,
) -> pd.DataFrame:
if not df.width: # Empty dataframe, cannot infer schema from batches
return pd.DataFrame()

record_batches = df._df.to_pandas()
tbl = pa.Table.from_batches(record_batches)
if use_pyarrow_extension_array:
return tbl.to_pandas(
Expand Down
4 changes: 4 additions & 0 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4492,6 +4492,10 @@ def to_pandas(
2 <NA>
Name: b, dtype: int64[pyarrow]
"""
if self.dtype == Object:
# Can't convert via PyArrow, so do it via NumPy:
return pd.Series(self.to_numpy(), dtype=object, name=self.name)

if use_pyarrow_extension_array:
if parse_version(pd.__version__) < (1, 5):
msg = f'pandas>=1.5.0 is required for `to_pandas("use_pyarrow_extension_array=True")`, found Pandas {pd.__version__}'
Expand Down
6 changes: 5 additions & 1 deletion py-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -873,6 +873,11 @@ impl PyDataFrame {
})
}

/// Create a `Vec` of PyArrow RecordBatch instances.
///
/// Note this will give bad results for columns with dtype `pl.Object`,
/// since those can't be converted correctly via PyArrow. The calling Python
/// code should make sure these are not included.
pub fn to_pandas(&mut self) -> PyResult<Vec<PyObject>> {
self.df.as_single_chunk_par();
Python::with_gil(|py| {
Expand All @@ -891,7 +896,6 @@ impl PyDataFrame {
})
.map(|(i, _)| i)
.collect::<Vec<_>>();

let rbs = self
.df
.iter_chunks(false)
Expand Down
42 changes: 42 additions & 0 deletions py-polars/tests/unit/interop/test_to_pandas.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
from __future__ import annotations

from datetime import date, datetime
from typing import Literal

import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
from hypothesis import given
from hypothesis.strategies import just, lists, one_of

import polars as pl

Expand Down Expand Up @@ -83,6 +88,32 @@ def test_cat_to_pandas(dtype: pl.DataType) -> None:
)


@given(
    column_type_names=lists(
        one_of(just("Object"), just("Int32")), min_size=1, max_size=8
    )
)
def test_object_to_pandas(column_type_names: list[Literal["Object", "Int32"]]) -> None:
    """
    Converting ``pl.Object`` dtype columns to Pandas is handled correctly.
    This edge case is handled with a separate code path than other data types,
    so we test it more thoroughly.
    """
    dtypes = [getattr(pl, name) for name in column_type_names]

    # One row per column: an opaque object() for Object columns, a
    # distinguishable negative integer for Int32 columns.
    data: dict[str, list] = {}
    for idx, dtype in enumerate(dtypes):
        data[f"col_{idx}"] = [object()] if dtype == pl.Object else [-idx]

    schema = {f"col_{idx}": dtype for idx, dtype in enumerate(dtypes)}
    df = pl.DataFrame(data, schema=schema)

    # Both the pyarrow-extension-array and plain paths must round-trip.
    for use_pyarrow in (True, False):
        result = df.to_pandas(use_pyarrow_extension_array=use_pyarrow)
        assert isinstance(result, pd.DataFrame)
        assert result.to_dict(orient="list") == data


def test_from_empty_pandas_with_dtypes() -> None:
df = pd.DataFrame(columns=["a", "b"])
df["a"] = df["a"].astype(str)
Expand Down Expand Up @@ -137,3 +168,14 @@ def test_to_pandas_datetime() -> None:
result_df = s.to_frame().to_pandas()
expected_df = expected_series.to_frame()
pd.testing.assert_frame_equal(result_df, expected_df)


@pytest.mark.parametrize("use_pyarrow_extension_array", [True, False])
def test_object_to_pandas_series(use_pyarrow_extension_array: bool) -> None:
    # A pl.Object Series must convert to an object-dtype pandas Series
    # regardless of the pyarrow flag (the Object path bypasses Arrow).
    values = [object(), [1, 2, 3]]
    result = pl.Series("a", values, dtype=pl.Object).to_pandas(
        use_pyarrow_extension_array=use_pyarrow_extension_array
    )
    expected = pd.Series(values, dtype=object, name="a")
    pd.testing.assert_series_equal(result, expected)
6 changes: 0 additions & 6 deletions py-polars/tests/unit/namespaces/test_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import pytest

import polars as pl
from polars.exceptions import PolarsPanicError

# Calling `plot` the first time is slow
# https://github.com/pola-rs/polars/issues/13500
Expand Down Expand Up @@ -39,8 +38,3 @@ def test_series_hist() -> None:

def test_empty_dataframe() -> None:
    # Plotting a 0-row frame should complete without raising.
    empty = pl.DataFrame({"a": [], "b": []})
    empty.plot.scatter(x="a", y="b")


def test_unsupported_dtype() -> None:
with pytest.raises(PolarsPanicError):
pl.DataFrame({"a": [{1, 2}], "b": [4]}).plot.scatter(x="a", y="b")

0 comments on commit 79a0984

Please sign in to comment.