From 79a0984df5d380dc2abdc3e8d9a2bdbf48c49b28 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Fri, 26 Jan 2024 10:19:26 -0500 Subject: [PATCH] fix(python): Make `to_pandas()` work for Dataframe and Series with dtype `Object` (#13910) Co-authored-by: Itamar Turner-Trauring Co-authored-by: Stijn de Gooijer --- py-polars/polars/dataframe/frame.py | 63 +++++++++++++++++-- py-polars/polars/series/series.py | 4 ++ py-polars/src/dataframe.rs | 6 +- .../tests/unit/interop/test_to_pandas.py | 42 +++++++++++++ py-polars/tests/unit/namespaces/test_plot.py | 6 -- 5 files changed, 110 insertions(+), 11 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index c5f9bef8503a..cbc5479653b4 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -2266,9 +2266,6 @@ def to_pandas( ham large_string[pyarrow] dtype: object """ - if not self.width: # 0x0 dataframe, cannot infer schema from batches - return pd.DataFrame() - if use_pyarrow_extension_array: if parse_version(pd.__version__) < parse_version("1.5"): msg = f'pandas>=1.5.0 is required for `to_pandas("use_pyarrow_extension_array=True")`, found Pandas {pd.__version__!r}' @@ -2281,7 +2278,65 @@ def to_pandas( else: raise ModuleNotFoundError(msg) - record_batches = self._df.to_pandas() + # Object columns must be handled separately as Arrow does not convert them + # correctly + if Object in self.dtypes: + return self._to_pandas_with_object_columns( + use_pyarrow_extension_array=use_pyarrow_extension_array, **kwargs + ) + + return self._to_pandas_without_object_columns( + self, use_pyarrow_extension_array=use_pyarrow_extension_array, **kwargs + ) + + def _to_pandas_with_object_columns( + self, + *, + use_pyarrow_extension_array: bool, + **kwargs: Any, + ) -> pd.DataFrame: + # Find which columns are of type pl.Object, and which aren't: + object_columns = [] + not_object_columns = [] + for i, dtype in enumerate(self.dtypes): + if dtype == Object: + object_columns.append(i) + else: + not_object_columns.append(i) + + # Export columns that aren't pl.Object, in the same order: + if not_object_columns: + df_without_objects = self[:, not_object_columns] + pandas_df = self._to_pandas_without_object_columns( + df_without_objects, + use_pyarrow_extension_array=use_pyarrow_extension_array, + **kwargs, + ) + else: + pandas_df = pd.DataFrame() + + # Add columns that are pl.Object, using Series' custom to_pandas() + # logic for this case. We do this in order, so the original index for + # the next column in this dataframe is correct for the partially + # constructed Pandas dataframe, since there are no additional or + # missing columns to the inserted column's left. + for i in object_columns: + name = self.columns[i] + pandas_df.insert(i, name, self.to_series(i).to_pandas()) + + return pandas_df + + def _to_pandas_without_object_columns( + self, + df: DataFrame, + *, + use_pyarrow_extension_array: bool, + **kwargs: Any, + ) -> pd.DataFrame: + if not df.width: # Empty dataframe, cannot infer schema from batches + return pd.DataFrame() + + record_batches = df._df.to_pandas() tbl = pa.Table.from_batches(record_batches) if use_pyarrow_extension_array: return tbl.to_pandas( diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index f19280bc801c..f43fced51a1b 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -4492,6 +4492,10 @@ def to_pandas( 2 Name: b, dtype: int64[pyarrow] """ + if self.dtype == Object: + # Can't convert via PyArrow, so do it via NumPy: + return pd.Series(self.to_numpy(), dtype=object, name=self.name) + if use_pyarrow_extension_array: if parse_version(pd.__version__) < (1, 5): msg = f'pandas>=1.5.0 is required for `to_pandas("use_pyarrow_extension_array=True")`, found Pandas {pd.__version__}' diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index 852d4a06e12a..83a2a6b62c58 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -873,6 +873,11 @@ impl PyDataFrame { }) } + /// Create a `Vec` of PyArrow RecordBatch instances. + /// + /// Note this will give bad results for columns with dtype `pl.Object`, + /// since those can't be converted correctly via PyArrow. The calling Python + /// code should make sure these are not included. pub fn to_pandas(&mut self) -> PyResult> { self.df.as_single_chunk_par(); Python::with_gil(|py| { @@ -891,7 +896,6 @@ impl PyDataFrame { }) .map(|(i, _)| i) .collect::>(); - let rbs = self .df .iter_chunks(false) diff --git a/py-polars/tests/unit/interop/test_to_pandas.py b/py-polars/tests/unit/interop/test_to_pandas.py index 7e3b6e5b53a3..2d15c16dcd7f 100644 --- a/py-polars/tests/unit/interop/test_to_pandas.py +++ b/py-polars/tests/unit/interop/test_to_pandas.py @@ -1,9 +1,14 @@ +from __future__ import annotations + from datetime import date, datetime +from typing import Literal import numpy as np import pandas as pd import pyarrow as pa import pytest +from hypothesis import given +from hypothesis.strategies import just, lists, one_of import polars as pl @@ -83,6 +88,32 @@ def test_cat_to_pandas(dtype: pl.DataType) -> None: ) +@given( + column_type_names=lists( + one_of(just("Object"), just("Int32")), min_size=1, max_size=8 + ) +) +def test_object_to_pandas(column_type_names: list[Literal["Object", "Int32"]]) -> None: + """ + Converting ``pl.Object`` dtype columns to Pandas is handled correctly. + + This edge case is handled with a separate code path than other data types, + so we test it more thoroughly. + """ + column_types = [getattr(pl, name) for name in column_type_names] + data = { + f"col_{i}": [object()] if dtype == pl.Object else [-i] + for i, dtype in enumerate(column_types) + } + df = pl.DataFrame( + data, schema={f"col_{i}": column_types[i] for i in range(len(column_types))} + ) + for pyarrow in [True, False]: + pandas_df = df.to_pandas(use_pyarrow_extension_array=pyarrow) + assert isinstance(pandas_df, pd.DataFrame) + assert pandas_df.to_dict(orient="list") == data + + def test_from_empty_pandas_with_dtypes() -> None: df = pd.DataFrame(columns=["a", "b"]) df["a"] = df["a"].astype(str) @@ -137,3 +168,14 @@ def test_to_pandas_datetime() -> None: result_df = s.to_frame().to_pandas() expected_df = expected_series.to_frame() pd.testing.assert_frame_equal(result_df, expected_df) + + +@pytest.mark.parametrize("use_pyarrow_extension_array", [True, False]) +def test_object_to_pandas_series(use_pyarrow_extension_array: bool) -> None: + values = [object(), [1, 2, 3]] + pd.testing.assert_series_equal( + pl.Series("a", values, dtype=pl.Object).to_pandas( + use_pyarrow_extension_array=use_pyarrow_extension_array + ), + pd.Series(values, dtype=object, name="a"), + ) diff --git a/py-polars/tests/unit/namespaces/test_plot.py b/py-polars/tests/unit/namespaces/test_plot.py index cb4f7c4825b9..34f8964512d8 100644 --- a/py-polars/tests/unit/namespaces/test_plot.py +++ b/py-polars/tests/unit/namespaces/test_plot.py @@ -3,7 +3,6 @@ import pytest import polars as pl -from polars.exceptions import PolarsPanicError # Calling `plot` the first time is slow # https://github.com/pola-rs/polars/issues/13500 @@ -39,8 +38,3 @@ def test_series_hist() -> None: def test_empty_dataframe() -> None: pl.DataFrame({"a": [], "b": []}).plot.scatter(x="a", y="b") - - -def test_unsupported_dtype() -> None: - with pytest.raises(PolarsPanicError): - pl.DataFrame({"a": [{1, 2}], "b": [4]}).plot.scatter(x="a", y="b")